aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig25
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c57
-rw-r--r--mm/cma.c31
-rw-r--r--mm/cma.h1
-rw-r--r--mm/compaction.c60
-rw-r--r--mm/damon/Kconfig19
-rw-r--r--mm/damon/Makefile7
-rw-r--r--mm/damon/core-test.h21
-rw-r--r--mm/damon/core.c190
-rw-r--r--mm/damon/dbgfs-test.h85
-rw-r--r--mm/damon/dbgfs.c222
-rw-r--r--mm/damon/ops-common.c (renamed from mm/damon/prmtv-common.c)2
-rw-r--r--mm/damon/ops-common.h (renamed from mm/damon/prmtv-common.h)0
-rw-r--r--mm/damon/paddr.c98
-rw-r--r--mm/damon/reclaim.c9
-rw-r--r--mm/damon/sysfs.c2596
-rw-r--r--mm/damon/vaddr-test.h8
-rw-r--r--mm/damon/vaddr.c43
-rw-r--r--mm/debug.c19
-rw-r--r--mm/debug_vm_pgtable.c2
-rw-r--r--mm/early_ioremap.c1
-rw-r--r--mm/fadvise.c5
-rw-r--r--mm/filemap.c149
-rw-r--r--mm/folio-compat.c13
-rw-r--r--mm/gup.c637
-rw-r--r--mm/highmem.c9
-rw-r--r--mm/hmm.c3
-rw-r--r--mm/huge_memory.c300
-rw-r--r--mm/hugetlb.c49
-rw-r--r--mm/hugetlb_vmemmap.c68
-rw-r--r--mm/hwpoison-inject.c7
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/internal.h136
-rw-r--r--mm/kasan/Makefile2
-rw-r--r--mm/kasan/common.c4
-rw-r--r--mm/kasan/hw_tags.c211
-rw-r--r--mm/kasan/kasan.h56
-rw-r--r--mm/kasan/report.c336
-rw-r--r--mm/kasan/report_generic.c34
-rw-r--r--mm/kasan/report_hw_tags.c1
-rw-r--r--mm/kasan/report_sw_tags.c16
-rw-r--r--mm/kasan/report_tags.c2
-rw-r--r--mm/kasan/shadow.c64
-rw-r--r--mm/kfence/Makefile2
-rw-r--r--mm/kfence/core.c144
-rw-r--r--mm/kfence/kfence_test.c11
-rw-r--r--mm/khugepaged.c63
-rw-r--r--mm/kmemleak.c13
-rw-r--r--mm/ksm.c38
-rw-r--r--mm/list_lru.c422
-rw-r--r--mm/maccess.c125
-rw-r--r--mm/madvise.c152
-rw-r--r--mm/memblock.c15
-rw-r--r--mm/memcontrol.c542
-rw-r--r--mm/memfd.c40
-rw-r--r--mm/memory-failure.c162
-rw-r--r--mm/memory.c282
-rw-r--r--mm/memory_hotplug.c145
-rw-r--r--mm/mempolicy.c31
-rw-r--r--mm/memremap.c70
-rw-r--r--mm/migrate.c988
-rw-r--r--mm/migrate_device.c773
-rw-r--r--mm/mlock.c648
-rw-r--r--mm/mmap.c50
-rw-r--r--mm/mmzone.c14
-rw-r--r--mm/mprotect.c17
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c66
-rw-r--r--mm/page_alloc.c556
-rw-r--r--mm/page_idle.c30
-rw-r--r--mm/page_io.c32
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/page_owner.c71
-rw-r--r--mm/page_table_check.c65
-rw-r--r--mm/page_vma_mapped.c58
-rw-r--r--mm/percpu-stats.c2
-rw-r--r--mm/ptdump.c16
-rw-r--r--mm/readahead.c232
-rw-r--r--mm/rmap.c657
-rw-r--r--mm/secretmem.c2
-rw-r--r--mm/shmem.c48
-rw-r--r--mm/slab.c40
-rw-r--r--mm/slab.h25
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/slob.c8
-rw-r--r--mm/slub.c172
-rw-r--r--mm/sparse-vmemmap.c70
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c200
-rw-r--r--mm/swap_cgroup.c4
-rw-r--r--mm/swap_state.c4
-rw-r--r--mm/swapfile.c105
-rw-r--r--mm/truncate.c153
-rw-r--r--mm/usercopy.c39
-rw-r--r--mm/userfaultfd.c17
-rw-r--r--mm/util.c90
-rw-r--r--mm/vmalloc.c201
-rw-r--r--mm/vmscan.c445
-rw-r--r--mm/vmstat.c19
-rw-r--r--mm/workingset.c32
-rw-r--r--mm/zswap.c15
103 files changed, 8352 insertions, 5487 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3326ee3903f3..034d87953600 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -249,6 +249,9 @@ config MIGRATION
pages as migration can relocate pages to satisfy a huge page
allocation instead of reclaiming.
+config DEVICE_MIGRATION
+ def_bool MIGRATION && ZONE_DEVICE
+
config ARCH_ENABLE_HUGEPAGE_MIGRATION
bool
@@ -262,6 +265,9 @@ config HUGETLB_PAGE_SIZE_VARIABLE
HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available
on a platform.
+ Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be
+ clamped down to MAX_ORDER - 1.
+
config CONTIG_ALLOC
def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
@@ -411,6 +417,9 @@ choice
benefit.
endchoice
+config ARCH_WANT_GENERAL_HUGETLB
+ bool
+
config ARCH_WANTS_THP_SWAP
def_bool n
@@ -744,6 +753,18 @@ config IDLE_PAGE_TRACKING
config ARCH_HAS_CACHE_LINE_SIZE
bool
+config ARCH_HAS_CURRENT_STACK_POINTER
+ bool
+ help
+ In support of HARDENED_USERCOPY performing stack variable lifetime
+ checking, an architecture-agnostic way to find the stack pointer
+ is needed. Once an architecture defines an unsigned long global
+ register alias named "current_stack_pointer", this config can be
+ selected.
+
+config ARCH_HAS_FILTER_PGPROT
+ bool
+
config ARCH_HAS_PTE_DEVMAP
bool
@@ -776,9 +797,6 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
-config DEV_PAGEMAP_OPS
- bool
-
#
# Helpers to mirror range of the CPU page tables of a process into device page
# tables.
@@ -790,7 +808,6 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ZONE_DEVICE
- select DEV_PAGEMAP_OPS
help
Allows creation of struct pages to represent unaddressable device
diff --git a/mm/Makefile b/mm/Makefile
index 70d4309c9ce3..4cc13f3179a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index eae96dfe0261..7176af65b103 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1005,60 +1005,3 @@ const char *bdi_dev_name(struct backing_dev_info *bdi)
return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);
-
-static wait_queue_head_t congestion_wqh[2] = {
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
- __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
- };
-static atomic_t nr_wb_congested[2];
-
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- wait_queue_head_t *wqh = &congestion_wqh[sync];
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &bdi->wb.congested))
- atomic_dec(&nr_wb_congested[sync]);
- smp_mb__after_atomic();
- if (waitqueue_active(wqh))
- wake_up(wqh);
-}
-EXPORT_SYMBOL(clear_bdi_congested);
-
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
-{
- enum wb_congested_state bit;
-
- bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &bdi->wb.congested))
- atomic_inc(&nr_wb_congested[sync]);
-}
-EXPORT_SYMBOL(set_bdi_congested);
-
-/**
- * congestion_wait - wait for a backing_dev to become uncongested
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
- * write congestion. If no backing_devs are congested then just wait for the
- * next write to be completed.
- */
-long congestion_wait(int sync, long timeout)
-{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
- trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
-}
-EXPORT_SYMBOL(congestion_wait);
diff --git a/mm/cma.c b/mm/cma.c
index bc9ca8f3c487..eaa4b5c920a2 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -131,8 +131,10 @@ not_in_zone:
bitmap_free(cma->bitmap);
out_error:
/* Expose all pages to the buddy, they are useless for CMA. */
- for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
- free_reserved_page(pfn_to_page(pfn));
+ if (!cma->reserve_pages_on_error) {
+ for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ }
totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
@@ -150,6 +152,11 @@ static int __init cma_init_reserved_areas(void)
}
core_initcall(cma_init_reserved_areas);
+void __init cma_reserve_pages_on_error(struct cma *cma)
+{
+ cma->reserve_pages_on_error = true;
+}
+
/**
* cma_init_reserved_mem() - create custom contiguous area from reserved memory
* @base: Base address of the reserved area
@@ -168,7 +175,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
struct cma **res_cma)
{
struct cma *cma;
- phys_addr_t alignment;
/* Sanity checks */
if (cma_area_count == ARRAY_SIZE(cma_areas)) {
@@ -179,15 +185,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* ensure minimal alignment required by mm core */
- alignment = PAGE_SIZE <<
- max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
-
/* alignment should be aligned with order_per_bit */
- if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
+ if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit))
return -EINVAL;
- if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size)
+ /* ensure minimal alignment required by mm core */
+ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES))
return -EINVAL;
/*
@@ -262,14 +265,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (alignment && !is_power_of_2(alignment))
return -EINVAL;
- /*
- * Sanitise input arguments.
- * Pages both ends in CMA area could be merged into adjacent unmovable
- * migratetype page by page allocator's buddy algorithm. In the case,
- * you couldn't get a contiguous memory, which is not what we want.
- */
- alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
- max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ /* Sanitise input arguments. */
+ alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES);
if (fixed && base & (alignment - 1)) {
ret = -EINVAL;
pr_err("Region at %pa must be aligned to %pa bytes\n",
diff --git a/mm/cma.h b/mm/cma.h
index 2c775877eae2..88a0595670b7 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -30,6 +30,7 @@ struct cma {
/* kobject requires dynamic object */
struct cma_kobject *cma_kobj;
#endif
+ bool reserve_pages_on_error;
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/compaction.c b/mm/compaction.c
index b4e94cda3019..c3e37aa9ff9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -785,7 +785,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
* @cc: Compaction control structure.
* @low_pfn: The first PFN to isolate
* @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
- * @isolate_mode: Isolation mode to be used.
+ * @mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). The range is expected to be within same pageblock.
@@ -798,7 +798,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
*/
static int
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
- unsigned long end_pfn, isolate_mode_t isolate_mode)
+ unsigned long end_pfn, isolate_mode_t mode)
{
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -806,6 +806,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long flags = 0;
struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
+ struct address_space *mapping;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
@@ -990,7 +991,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
locked = NULL;
}
- if (!isolate_movable_page(page, isolate_mode))
+ if (!isolate_movable_page(page, mode))
goto isolate_success;
}
@@ -1002,15 +1003,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
- if (!page_mapping(page) &&
- page_count(page) > page_mapcount(page))
+ mapping = page_mapping(page);
+ if (!mapping && page_count(page) > page_mapcount(page))
goto isolate_fail;
/*
* Only allow to migrate anonymous pages in GFP_NOFS context
* because those do not depend on fs locks.
*/
- if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
+ if (!(cc->gfp_mask & __GFP_FS) && mapping)
goto isolate_fail;
/*
@@ -1021,9 +1022,45 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
- if (!__isolate_lru_page_prepare(page, isolate_mode))
+ /* Only take pages on LRU: a check now makes later tests safe */
+ if (!PageLRU(page))
+ goto isolate_fail_put;
+
+ /* Compaction might skip unevictable pages but CMA takes them */
+ if (!(mode & ISOLATE_UNEVICTABLE) && PageUnevictable(page))
+ goto isolate_fail_put;
+
+ /*
+ * To minimise LRU disruption, the caller can indicate with
+ * ISOLATE_ASYNC_MIGRATE that it only wants to isolate pages
+ * it will be able to migrate without blocking - clean pages
+ * for the most part. PageWriteback would require blocking.
+ */
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && PageWriteback(page))
goto isolate_fail_put;
+ if ((mode & ISOLATE_ASYNC_MIGRATE) && PageDirty(page)) {
+ bool migrate_dirty;
+
+ /*
+ * Only pages without mappings or that have a
+ * ->migratepage callback are possible to migrate
+ * without blocking. However, we can be racing with
+ * truncation so it's necessary to lock the page
+ * to stabilise the mapping as truncation holds
+ * the page lock until after the page is removed
+ * from the page cache.
+ */
+ if (!trylock_page(page))
+ goto isolate_fail_put;
+
+ mapping = page_mapping(page);
+ migrate_dirty = !mapping || mapping->a_ops->migratepage;
+ unlock_page(page);
+ if (!migrate_dirty)
+ goto isolate_fail_put;
+ }
+
/* Try isolate the page */
if (!TestClearPageLRU(page))
goto isolate_fail_put;
@@ -2350,8 +2387,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
update_cached = !sync &&
cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync);
+ trace_mm_compaction_begin(cc, start_pfn, end_pfn, sync);
/* lru_add_drain_all could be expensive with involving other CPUs */
lru_add_drain();
@@ -2401,8 +2437,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc->nr_migratepages,
- nr_succeeded);
+ trace_mm_compaction_migratepages(cc, nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
@@ -2478,8 +2513,7 @@ out:
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
- trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
- cc->free_pfn, end_pfn, sync, ret);
+ trace_mm_compaction_end(cc, start_pfn, end_pfn, sync, ret);
return ret;
}
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 5bcf05851ad0..9b559c76d6dd 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -25,33 +25,40 @@ config DAMON_KUNIT_TEST
If unsure, say N.
config DAMON_VADDR
- bool "Data access monitoring primitives for virtual address spaces"
+ bool "Data access monitoring operations for virtual address spaces"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
help
- This builds the default data access monitoring primitives for DAMON
+ This builds the default data access monitoring operations for DAMON
that work for virtual address spaces.
config DAMON_PADDR
- bool "Data access monitoring primitives for the physical address space"
+ bool "Data access monitoring operations for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
help
- This builds the default data access monitoring primitives for DAMON
+ This builds the default data access monitoring operations for DAMON
that works for the physical address space.
config DAMON_VADDR_KUNIT_TEST
- bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
+ bool "Test for DAMON operations" if !KUNIT_ALL_TESTS
depends on DAMON_VADDR && KUNIT=y
default KUNIT_ALL_TESTS
help
- This builds the DAMON virtual addresses primitives Kunit test suite.
+ This builds the DAMON virtual addresses operations Kunit test suite.
For more information on KUnit and unit tests in general, please refer
to the KUnit documentation.
If unsure, say N.
+config DAMON_SYSFS
+ bool "DAMON sysfs interface"
+ depends on DAMON && SYSFS
+ help
+ This builds the sysfs interface for DAMON. The user space can use
+ the interface for arbitrary data access monitoring.
+
config DAMON_DBGFS
bool "DAMON debugfs interface"
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index f7d5ac377a2b..dbf7190b4144 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -1,7 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_DAMON) := core.o
-obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o
-obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o
+obj-y := core.o
+obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o
+obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
+obj-$(CONFIG_DAMON_SYSFS) += sysfs.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 7008c3735e99..b4085deb9fa0 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -24,7 +24,7 @@ static void damon_test_regions(struct kunit *test)
KUNIT_EXPECT_EQ(test, 2ul, r->ar.end);
KUNIT_EXPECT_EQ(test, 0u, r->nr_accesses);
- t = damon_new_target(42);
+ t = damon_new_target();
KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t));
damon_add_region(r, t);
@@ -52,8 +52,7 @@ static void damon_test_target(struct kunit *test)
struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
- t = damon_new_target(42);
- KUNIT_EXPECT_EQ(test, 42ul, t->id);
+ t = damon_new_target();
KUNIT_EXPECT_EQ(test, 0u, nr_damon_targets(c));
damon_add_target(c, t);
@@ -78,7 +77,6 @@ static void damon_test_target(struct kunit *test)
static void damon_test_aggregate(struct kunit *test)
{
struct damon_ctx *ctx = damon_new_ctx();
- unsigned long target_ids[] = {1, 2, 3};
unsigned long saddr[][3] = {{10, 20, 30}, {5, 42, 49}, {13, 33, 55} };
unsigned long eaddr[][3] = {{15, 27, 40}, {31, 45, 55}, {23, 44, 66} };
unsigned long accesses[][3] = {{42, 95, 84}, {10, 20, 30}, {0, 1, 2} };
@@ -86,7 +84,10 @@ static void damon_test_aggregate(struct kunit *test)
struct damon_region *r;
int it, ir;
- damon_set_targets(ctx, target_ids, 3);
+ for (it = 0; it < 3; it++) {
+ t = damon_new_target();
+ damon_add_target(ctx, t);
+ }
it = 0;
damon_for_each_target(t, ctx) {
@@ -122,7 +123,7 @@ static void damon_test_split_at(struct kunit *test)
struct damon_target *t;
struct damon_region *r;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 100);
damon_add_region(r, t);
damon_split_region_at(c, t, r, 25);
@@ -143,7 +144,7 @@ static void damon_test_merge_two(struct kunit *test)
struct damon_region *r, *r2, *r3;
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 100);
r->nr_accesses = 10;
damon_add_region(r, t);
@@ -191,7 +192,7 @@ static void damon_test_merge_regions_of(struct kunit *test)
unsigned long eaddrs[] = {112, 130, 156, 170, 230};
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
for (i = 0; i < ARRAY_SIZE(sa); i++) {
r = damon_new_region(sa[i], ea[i]);
r->nr_accesses = nrs[i];
@@ -215,14 +216,14 @@ static void damon_test_split_regions_of(struct kunit *test)
struct damon_target *t;
struct damon_region *r;
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
damon_split_regions_of(c, t, 2);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
- t = damon_new_target(42);
+ t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
damon_split_regions_of(c, t, 4);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 1dd153c31c9e..c1e0fed4e877 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -24,6 +24,73 @@
static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;
+static bool running_exclusive_ctxs;
+
+static DEFINE_MUTEX(damon_ops_lock);
+static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
+
+/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
+static bool damon_registered_ops_id(enum damon_ops_id id)
+{
+ struct damon_operations empty_ops = {};
+
+ if (!memcmp(&empty_ops, &damon_registered_ops[id], sizeof(empty_ops)))
+ return false;
+ return true;
+}
+
+/**
+ * damon_register_ops() - Register a monitoring operations set to DAMON.
+ * @ops: monitoring operations set to register.
+ *
+ * This function registers a monitoring operations set of valid &struct
+ * damon_operations->id so that others can find and use them later.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_register_ops(struct damon_operations *ops)
+{
+ int err = 0;
+
+ if (ops->id >= NR_DAMON_OPS)
+ return -EINVAL;
+ mutex_lock(&damon_ops_lock);
+ /* Fail for already registered ops */
+ if (damon_registered_ops_id(ops->id)) {
+ err = -EINVAL;
+ goto out;
+ }
+ damon_registered_ops[ops->id] = *ops;
+out:
+ mutex_unlock(&damon_ops_lock);
+ return err;
+}
+
+/**
+ * damon_select_ops() - Select a monitoring operations to use with the context.
+ * @ctx: monitoring context to use the operations.
+ * @id: id of the registered monitoring operations to select.
+ *
+ * This function finds registered monitoring operations set of @id and make
+ * @ctx to use it.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id)
+{
+ int err = 0;
+
+ if (id >= NR_DAMON_OPS)
+ return -EINVAL;
+
+ mutex_lock(&damon_ops_lock);
+ if (!damon_registered_ops_id(id))
+ err = -EINVAL;
+ else
+ ctx->ops = damon_registered_ops[id];
+ mutex_unlock(&damon_ops_lock);
+ return err;
+}
/*
* Construct a damon_region struct
@@ -144,7 +211,7 @@ void damon_destroy_scheme(struct damos *s)
*
* Returns the pointer to the new struct if success, or NULL otherwise
*/
-struct damon_target *damon_new_target(unsigned long id)
+struct damon_target *damon_new_target(void)
{
struct damon_target *t;
@@ -152,7 +219,7 @@ struct damon_target *damon_new_target(unsigned long id)
if (!t)
return NULL;
- t->id = id;
+ t->pid = NULL;
t->nr_regions = 0;
INIT_LIST_HEAD(&t->regions_list);
@@ -204,10 +271,10 @@ struct damon_ctx *damon_new_ctx(void)
ctx->sample_interval = 5 * 1000;
ctx->aggr_interval = 100 * 1000;
- ctx->primitive_update_interval = 60 * 1000 * 1000;
+ ctx->ops_update_interval = 60 * 1000 * 1000;
ktime_get_coarse_ts64(&ctx->last_aggregation);
- ctx->last_primitive_update = ctx->last_aggregation;
+ ctx->last_ops_update = ctx->last_aggregation;
mutex_init(&ctx->kdamond_lock);
@@ -224,8 +291,8 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next_t;
- if (ctx->primitive.cleanup) {
- ctx->primitive.cleanup(ctx);
+ if (ctx->ops.cleanup) {
+ ctx->ops.cleanup(ctx);
return;
}
@@ -246,43 +313,11 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
}
/**
- * damon_set_targets() - Set monitoring targets.
- * @ctx: monitoring context
- * @ids: array of target ids
- * @nr_ids: number of entries in @ids
- *
- * This function should not be called while the kdamond is running.
- *
- * Return: 0 on success, negative error code otherwise.
- */
-int damon_set_targets(struct damon_ctx *ctx,
- unsigned long *ids, ssize_t nr_ids)
-{
- ssize_t i;
- struct damon_target *t, *next;
-
- damon_destroy_targets(ctx);
-
- for (i = 0; i < nr_ids; i++) {
- t = damon_new_target(ids[i]);
- if (!t) {
- /* The caller should do cleanup of the ids itself */
- damon_for_each_target_safe(t, next, ctx)
- damon_destroy_target(t);
- return -ENOMEM;
- }
- damon_add_target(ctx, t);
- }
-
- return 0;
-}
-
-/**
* damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context
* @sample_int: time interval between samplings
* @aggr_int: time interval between aggregations
- * @primitive_upd_int: time interval between monitoring primitive updates
+ * @ops_upd_int: time interval between monitoring operations updates
* @min_nr_reg: minimal number of regions
* @max_nr_reg: maximum number of regions
*
@@ -292,7 +327,7 @@ int damon_set_targets(struct damon_ctx *ctx,
* Return: 0 on success, negative error code otherwise.
*/
int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
- unsigned long aggr_int, unsigned long primitive_upd_int,
+ unsigned long aggr_int, unsigned long ops_upd_int,
unsigned long min_nr_reg, unsigned long max_nr_reg)
{
if (min_nr_reg < 3)
@@ -302,7 +337,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
ctx->sample_interval = sample_int;
ctx->aggr_interval = aggr_int;
- ctx->primitive_update_interval = primitive_upd_int;
+ ctx->ops_update_interval = ops_upd_int;
ctx->min_nr_regions = min_nr_reg;
ctx->max_nr_regions = max_nr_reg;
@@ -400,22 +435,25 @@ static int __damon_start(struct damon_ctx *ctx)
* damon_start() - Starts the monitorings for a given group of contexts.
* @ctxs: an array of the pointers for contexts to start monitoring
* @nr_ctxs: size of @ctxs
+ * @exclusive: exclusiveness of this contexts group
*
* This function starts a group of monitoring threads for a group of monitoring
* contexts. One thread per each context is created and run in parallel. The
- * caller should handle synchronization between the threads by itself. If a
- * group of threads that created by other 'damon_start()' call is currently
- * running, this function does nothing but returns -EBUSY.
+ * caller should handle synchronization between the threads by itself. If
+ * @exclusive is true and a group of threads that created by other
+ * 'damon_start()' call is currently running, this function does nothing but
+ * returns -EBUSY.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
+int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive)
{
int i;
int err = 0;
mutex_lock(&damon_lock);
- if (nr_running_ctxs) {
+ if ((exclusive && nr_running_ctxs) ||
+ (!exclusive && running_exclusive_ctxs)) {
mutex_unlock(&damon_lock);
return -EBUSY;
}
@@ -426,13 +464,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
break;
nr_running_ctxs++;
}
+ if (exclusive && nr_running_ctxs)
+ running_exclusive_ctxs = true;
mutex_unlock(&damon_lock);
return err;
}
/*
- * __damon_stop() - Stops monitoring of given context.
+ * __damon_stop() - Stops monitoring of a given context.
* @ctx: monitoring context
*
* Return: 0 on success, negative error code otherwise.
@@ -470,9 +510,8 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs)
/* nr_running_ctxs is decremented in kdamond_fn */
err = __damon_stop(ctxs[i]);
if (err)
- return err;
+ break;
}
-
return err;
}
@@ -548,10 +587,10 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
{
bool ret = __damos_valid_target(r, s);
- if (!ret || !s->quota.esz || !c->primitive.get_scheme_score)
+ if (!ret || !s->quota.esz || !c->ops.get_scheme_score)
return ret;
- return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score;
+ return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score;
}
static void damon_do_apply_schemes(struct damon_ctx *c,
@@ -608,7 +647,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
continue;
/* Apply the scheme */
- if (c->primitive.apply_scheme) {
+ if (c->ops.apply_scheme) {
if (quota->esz &&
quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
@@ -618,7 +657,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_split_region_at(c, t, r, sz);
}
ktime_get_coarse_ts64(&begin);
- sz_applied = c->primitive.apply_scheme(c, t, r, s);
+ sz_applied = c->ops.apply_scheme(c, t, r, s);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -692,7 +731,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damos_set_effective_quota(quota);
}
- if (!c->primitive.get_scheme_score)
+ if (!c->ops.get_scheme_score)
continue;
/* Fill up the score histogram */
@@ -701,7 +740,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damon_for_each_region(r, t) {
if (!__damos_valid_target(r, s))
continue;
- score = c->primitive.get_scheme_score(
+ score = c->ops.get_scheme_score(
c, t, r, s);
quota->histogram[score] +=
r->ar.end - r->ar.start;
@@ -880,14 +919,15 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
}
/*
- * Check whether it is time to check and apply the target monitoring regions
+ * Check whether it is time to check and apply the operations-related data
+ * structures.
*
* Returns true if it is.
*/
-static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
+static bool kdamond_need_update_operations(struct damon_ctx *ctx)
{
- return damon_check_reset_time_interval(&ctx->last_primitive_update,
- ctx->primitive_update_interval);
+ return damon_check_reset_time_interval(&ctx->last_ops_update,
+ ctx->ops_update_interval);
}
/*
@@ -905,11 +945,11 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
if (kthread_should_stop())
return true;
- if (!ctx->primitive.target_valid)
+ if (!ctx->ops.target_valid)
return false;
damon_for_each_target(t, ctx) {
- if (ctx->primitive.target_valid(t))
+ if (ctx->ops.target_valid(t))
return false;
}
@@ -1008,8 +1048,8 @@ static int kdamond_fn(void *data)
pr_debug("kdamond (%d) starts\n", current->pid);
- if (ctx->primitive.init)
- ctx->primitive.init(ctx);
+ if (ctx->ops.init)
+ ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
done = true;
@@ -1019,16 +1059,16 @@ static int kdamond_fn(void *data)
if (kdamond_wait_activation(ctx))
continue;
- if (ctx->primitive.prepare_access_checks)
- ctx->primitive.prepare_access_checks(ctx);
+ if (ctx->ops.prepare_access_checks)
+ ctx->ops.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
ctx->callback.after_sampling(ctx))
done = true;
kdamond_usleep(ctx->sample_interval);
- if (ctx->primitive.check_accesses)
- max_nr_accesses = ctx->primitive.check_accesses(ctx);
+ if (ctx->ops.check_accesses)
+ max_nr_accesses = ctx->ops.check_accesses(ctx);
if (kdamond_aggregate_interval_passed(ctx)) {
kdamond_merge_regions(ctx,
@@ -1040,13 +1080,13 @@ static int kdamond_fn(void *data)
kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
- if (ctx->primitive.reset_aggregated)
- ctx->primitive.reset_aggregated(ctx);
+ if (ctx->ops.reset_aggregated)
+ ctx->ops.reset_aggregated(ctx);
}
- if (kdamond_need_update_primitive(ctx)) {
- if (ctx->primitive.update)
- ctx->primitive.update(ctx);
+ if (kdamond_need_update_operations(ctx)) {
+ if (ctx->ops.update)
+ ctx->ops.update(ctx);
sz_limit = damon_region_sz_limit(ctx);
}
}
@@ -1057,8 +1097,8 @@ static int kdamond_fn(void *data)
if (ctx->callback.before_terminate)
ctx->callback.before_terminate(ctx);
- if (ctx->primitive.cleanup)
- ctx->primitive.cleanup(ctx);
+ if (ctx->ops.cleanup)
+ ctx->ops.cleanup(ctx);
pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
@@ -1067,6 +1107,8 @@ static int kdamond_fn(void *data)
mutex_lock(&damon_lock);
nr_running_ctxs--;
+ if (!nr_running_ctxs && running_exclusive_ctxs)
+ running_exclusive_ctxs = false;
mutex_unlock(&damon_lock);
return 0;
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h
index 86b9f9528231..0bb0d532b159 100644
--- a/mm/damon/dbgfs-test.h
+++ b/mm/damon/dbgfs-test.h
@@ -12,66 +12,58 @@
#include <kunit/test.h>
-static void damon_dbgfs_test_str_to_target_ids(struct kunit *test)
+static void damon_dbgfs_test_str_to_ints(struct kunit *test)
{
char *question;
- unsigned long *answers;
- unsigned long expected[] = {12, 35, 46};
+ int *answers;
+ int expected[] = {12, 35, 46};
ssize_t nr_integers = 0, i;
question = "123";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123ul, answers[0]);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
kfree(answers);
question = "123abc";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)1, nr_integers);
- KUNIT_EXPECT_EQ(test, 123ul, answers[0]);
+ KUNIT_EXPECT_EQ(test, 123, answers[0]);
kfree(answers);
question = "a123";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
question = "12 35";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
for (i = 0; i < nr_integers; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "12 35 46";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)3, nr_integers);
for (i = 0; i < nr_integers; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "12 35 abc 46";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)2, nr_integers);
for (i = 0; i < 2; i++)
KUNIT_EXPECT_EQ(test, expected[i], answers[i]);
kfree(answers);
question = "";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
question = "\n";
- answers = str_to_target_ids(question, strlen(question),
- &nr_integers);
+ answers = str_to_ints(question, strlen(question), &nr_integers);
KUNIT_EXPECT_EQ(test, (ssize_t)0, nr_integers);
kfree(answers);
}
@@ -79,30 +71,20 @@ static void damon_dbgfs_test_str_to_target_ids(struct kunit *test)
static void damon_dbgfs_test_set_targets(struct kunit *test)
{
struct damon_ctx *ctx = dbgfs_new_ctx();
- unsigned long ids[] = {1, 2, 3};
char buf[64];
- /* Make DAMON consider target id as plain number */
- ctx->primitive.target_valid = NULL;
- ctx->primitive.cleanup = NULL;
+ /* Make DAMON consider target has no pid */
+ damon_select_ops(ctx, DAMON_OPS_PADDR);
- damon_set_targets(ctx, ids, 3);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2 3\n");
-
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
sprint_target_ids(ctx, buf, 64);
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
- damon_set_targets(ctx, (unsigned long []){1, 2}, 2);
+ dbgfs_set_targets(ctx, 1, NULL);
sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "1 2\n");
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "42\n");
- damon_set_targets(ctx, (unsigned long []){2}, 1);
- sprint_target_ids(ctx, buf, 64);
- KUNIT_EXPECT_STREQ(test, (char *)buf, "2\n");
-
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
sprint_target_ids(ctx, buf, 64);
KUNIT_EXPECT_STREQ(test, (char *)buf, "\n");
@@ -112,25 +94,26 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
static void damon_dbgfs_test_set_init_regions(struct kunit *test)
{
struct damon_ctx *ctx = damon_new_ctx();
- unsigned long ids[] = {1, 2, 3};
- /* Each line represents one region in ``<target id> <start> <end>`` */
- char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45",
- "2 10 20\n",
- "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n",
+ /* Each line represents one region in ``<target idx> <start> <end>`` */
+ char * const valid_inputs[] = {"1 10 20\n 1 20 30\n1 35 45",
+ "1 10 20\n",
+ "1 10 20\n0 39 59\n0 70 134\n 1 20 25\n",
""};
/* Reading the file again will show sorted, clean output */
- char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
- "2 10 20\n",
- "1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
+ char * const valid_expects[] = {"1 10 20\n1 20 30\n1 35 45\n",
+ "1 10 20\n",
+ "0 39 59\n0 70 134\n1 10 20\n1 20 25\n",
""};
- char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */
- "2 10 20\n 2 14 26\n", /* regions overlap */
- "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */
+ char * const invalid_inputs[] = {"3 10 20\n", /* target not exists */
+ "1 10 20\n 1 14 26\n", /* regions overlap */
+ "0 10 20\n1 30 40\n 0 5 8"}; /* not sorted by address */
char *input, *expect;
int i, rc;
char buf[256];
- damon_set_targets(ctx, ids, 3);
+ damon_select_ops(ctx, DAMON_OPS_PADDR);
+
+ dbgfs_set_targets(ctx, 3, NULL);
/* Put valid inputs and check the results */
for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
@@ -158,12 +141,12 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test)
KUNIT_EXPECT_STREQ(test, (char *)buf, "");
}
- damon_set_targets(ctx, NULL, 0);
+ dbgfs_set_targets(ctx, 0, NULL);
damon_destroy_ctx(ctx);
}
static struct kunit_case damon_test_cases[] = {
- KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
+ KUNIT_CASE(damon_dbgfs_test_str_to_ints),
KUNIT_CASE(damon_dbgfs_test_set_targets),
KUNIT_CASE(damon_dbgfs_test_set_init_regions),
{},
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 5b899601e56c..a0dab8b5e45f 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -56,7 +56,7 @@ static ssize_t dbgfs_attrs_read(struct file *file,
mutex_lock(&ctx->kdamond_lock);
ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
ctx->sample_interval, ctx->aggr_interval,
- ctx->primitive_update_interval, ctx->min_nr_regions,
+ ctx->ops_update_interval, ctx->min_nr_regions,
ctx->max_nr_regions);
mutex_unlock(&ctx->kdamond_lock);
@@ -275,25 +275,27 @@ out:
return ret;
}
-static inline bool targetid_is_pid(const struct damon_ctx *ctx)
+static inline bool target_has_pid(const struct damon_ctx *ctx)
{
- return ctx->primitive.target_valid == damon_va_target_valid;
+ return ctx->ops.id == DAMON_OPS_VADDR;
}
static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
{
struct damon_target *t;
- unsigned long id;
+ int id;
int written = 0;
int rc;
damon_for_each_target(t, ctx) {
- id = t->id;
- if (targetid_is_pid(ctx))
+ if (target_has_pid(ctx))
/* Show pid numbers to debugfs users */
- id = (unsigned long)pid_vnr((struct pid *)id);
+ id = pid_vnr(t->pid);
+ else
+ /* Show 42 for physical address space, just for fun */
+ id = 42;
- rc = scnprintf(&buf[written], len - written, "%lu ", id);
+ rc = scnprintf(&buf[written], len - written, "%d ", id);
if (!rc)
return -ENOMEM;
written += rc;
@@ -321,54 +323,129 @@ static ssize_t dbgfs_target_ids_read(struct file *file,
}
/*
- * Converts a string into an array of unsigned long integers
+ * Converts a string into an integers array
*
- * Returns an array of unsigned long integers if the conversion success, or
- * NULL otherwise.
+ * Returns an array of integers array if the conversion success, or NULL
+ * otherwise.
*/
-static unsigned long *str_to_target_ids(const char *str, ssize_t len,
- ssize_t *nr_ids)
+static int *str_to_ints(const char *str, ssize_t len, ssize_t *nr_ints)
{
- unsigned long *ids;
- const int max_nr_ids = 32;
- unsigned long id;
+ int *array;
+ const int max_nr_ints = 32;
+ int nr;
int pos = 0, parsed, ret;
- *nr_ids = 0;
- ids = kmalloc_array(max_nr_ids, sizeof(id), GFP_KERNEL);
- if (!ids)
+ *nr_ints = 0;
+ array = kmalloc_array(max_nr_ints, sizeof(*array), GFP_KERNEL);
+ if (!array)
return NULL;
- while (*nr_ids < max_nr_ids && pos < len) {
- ret = sscanf(&str[pos], "%lu%n", &id, &parsed);
+ while (*nr_ints < max_nr_ints && pos < len) {
+ ret = sscanf(&str[pos], "%d%n", &nr, &parsed);
pos += parsed;
if (ret != 1)
break;
- ids[*nr_ids] = id;
- *nr_ids += 1;
+ array[*nr_ints] = nr;
+ *nr_ints += 1;
}
- return ids;
+ return array;
}
-static void dbgfs_put_pids(unsigned long *ids, int nr_ids)
+static void dbgfs_put_pids(struct pid **pids, int nr_pids)
{
int i;
- for (i = 0; i < nr_ids; i++)
- put_pid((struct pid *)ids[i]);
+ for (i = 0; i < nr_pids; i++)
+ put_pid(pids[i]);
+}
+
+/*
+ * Converts a string into an struct pid pointers array
+ *
+ * Returns an array of struct pid pointers if the conversion success, or NULL
+ * otherwise.
+ */
+static struct pid **str_to_pids(const char *str, ssize_t len, ssize_t *nr_pids)
+{
+ int *ints;
+ ssize_t nr_ints;
+ struct pid **pids;
+
+ *nr_pids = 0;
+
+ ints = str_to_ints(str, len, &nr_ints);
+ if (!ints)
+ return NULL;
+
+ pids = kmalloc_array(nr_ints, sizeof(*pids), GFP_KERNEL);
+ if (!pids)
+ goto out;
+
+ for (; *nr_pids < nr_ints; (*nr_pids)++) {
+ pids[*nr_pids] = find_get_pid(ints[*nr_pids]);
+ if (!pids[*nr_pids]) {
+ dbgfs_put_pids(pids, *nr_pids);
+ kfree(ints);
+ kfree(pids);
+ return NULL;
+ }
+ }
+
+out:
+ kfree(ints);
+ return pids;
+}
+
+/*
+ * dbgfs_set_targets() - Set monitoring targets.
+ * @ctx: monitoring context
+ * @nr_targets: number of targets
+ * @pids: array of target pids (size is same to @nr_targets)
+ *
+ * This function should not be called while the kdamond is running. @pids is
+ * ignored if the context is not configured to have pid in each target. On
+ * failure, reference counts of all pids in @pids are decremented.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
+ struct pid **pids)
+{
+ ssize_t i;
+ struct damon_target *t, *next;
+
+ damon_for_each_target_safe(t, next, ctx) {
+ if (target_has_pid(ctx))
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+
+ for (i = 0; i < nr_targets; i++) {
+ t = damon_new_target();
+ if (!t) {
+ damon_for_each_target_safe(t, next, ctx)
+ damon_destroy_target(t);
+ if (target_has_pid(ctx))
+ dbgfs_put_pids(pids, nr_targets);
+ return -ENOMEM;
+ }
+ if (target_has_pid(ctx))
+ t->pid = pids[i];
+ damon_add_target(ctx, t);
+ }
+
+ return 0;
}
static ssize_t dbgfs_target_ids_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
- struct damon_target *t, *next_t;
bool id_is_pid = true;
char *kbuf;
- unsigned long *targets;
+ struct pid **target_pids = NULL;
ssize_t nr_targets;
ssize_t ret;
- int i;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@@ -376,61 +453,47 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
if (!strncmp(kbuf, "paddr\n", count)) {
id_is_pid = false;
- /* target id is meaningless here, but we set it just for fun */
- scnprintf(kbuf, count, "42 ");
- }
-
- targets = str_to_target_ids(kbuf, count, &nr_targets);
- if (!targets) {
- ret = -ENOMEM;
- goto out;
+ nr_targets = 1;
}
if (id_is_pid) {
- for (i = 0; i < nr_targets; i++) {
- targets[i] = (unsigned long)find_get_pid(
- (int)targets[i]);
- if (!targets[i]) {
- dbgfs_put_pids(targets, i);
- ret = -EINVAL;
- goto free_targets_out;
- }
+ target_pids = str_to_pids(kbuf, count, &nr_targets);
+ if (!target_pids) {
+ ret = -ENOMEM;
+ goto out;
}
}
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
if (id_is_pid)
- dbgfs_put_pids(targets, nr_targets);
+ dbgfs_put_pids(target_pids, nr_targets);
ret = -EBUSY;
goto unlock_out;
}
/* remove previously set targets */
- damon_for_each_target_safe(t, next_t, ctx) {
- if (targetid_is_pid(ctx))
- put_pid((struct pid *)t->id);
- damon_destroy_target(t);
+ dbgfs_set_targets(ctx, 0, NULL);
+ if (!nr_targets) {
+ ret = count;
+ goto unlock_out;
}
/* Configure the context for the address space type */
if (id_is_pid)
- damon_va_set_primitives(ctx);
+ ret = damon_select_ops(ctx, DAMON_OPS_VADDR);
else
- damon_pa_set_primitives(ctx);
+ ret = damon_select_ops(ctx, DAMON_OPS_PADDR);
+ if (ret)
+ goto unlock_out;
- ret = damon_set_targets(ctx, targets, nr_targets);
- if (ret) {
- if (id_is_pid)
- dbgfs_put_pids(targets, nr_targets);
- } else {
+ ret = dbgfs_set_targets(ctx, nr_targets, target_pids);
+ if (!ret)
ret = count;
- }
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
-free_targets_out:
- kfree(targets);
+ kfree(target_pids);
out:
kfree(kbuf);
return ret;
@@ -440,18 +503,20 @@ static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damon_target *t;
struct damon_region *r;
+ int target_idx = 0;
int written = 0;
int rc;
damon_for_each_target(t, c) {
damon_for_each_region(r, t) {
rc = scnprintf(&buf[written], len - written,
- "%lu %lu %lu\n",
- t->id, r->ar.start, r->ar.end);
+ "%d %lu %lu\n",
+ target_idx, r->ar.start, r->ar.end);
if (!rc)
return -ENOMEM;
written += rc;
}
+ target_idx++;
}
return written;
}
@@ -485,22 +550,19 @@ out:
return len;
}
-static int add_init_region(struct damon_ctx *c,
- unsigned long target_id, struct damon_addr_range *ar)
+static int add_init_region(struct damon_ctx *c, int target_idx,
+ struct damon_addr_range *ar)
{
struct damon_target *t;
struct damon_region *r, *prev;
- unsigned long id;
+ unsigned long idx = 0;
int rc = -EINVAL;
if (ar->start >= ar->end)
return -EINVAL;
damon_for_each_target(t, c) {
- id = t->id;
- if (targetid_is_pid(c))
- id = (unsigned long)pid_vnr((struct pid *)id);
- if (id == target_id) {
+ if (idx++ == target_idx) {
r = damon_new_region(ar->start, ar->end);
if (!r)
return -ENOMEM;
@@ -523,7 +585,7 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
struct damon_target *t;
struct damon_region *r, *next;
int pos = 0, parsed, ret;
- unsigned long target_id;
+ int target_idx;
struct damon_addr_range ar;
int err;
@@ -533,11 +595,11 @@ static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
}
while (pos < len) {
- ret = sscanf(&str[pos], "%lu %lu %lu%n",
- &target_id, &ar.start, &ar.end, &parsed);
+ ret = sscanf(&str[pos], "%d %lu %lu%n",
+ &target_idx, &ar.start, &ar.end, &parsed);
if (ret != 3)
break;
- err = add_init_region(c, target_id, &ar);
+ err = add_init_region(c, target_idx, &ar);
if (err)
goto fail;
pos += parsed;
@@ -660,12 +722,12 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (!targetid_is_pid(ctx))
+ if (!target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
damon_for_each_target_safe(t, next, ctx) {
- put_pid((struct pid *)t->id);
+ put_pid(t->pid);
damon_destroy_target(t);
}
mutex_unlock(&ctx->kdamond_lock);
@@ -679,7 +741,11 @@ static struct damon_ctx *dbgfs_new_ctx(void)
if (!ctx)
return NULL;
- damon_va_set_primitives(ctx);
+ if (damon_select_ops(ctx, DAMON_OPS_VADDR) &&
+ damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return NULL;
+ }
ctx->callback.before_terminate = dbgfs_before_terminate;
return ctx;
}
@@ -901,7 +967,7 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
return -EINVAL;
}
}
- ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
+ ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs, true);
} else if (!strncmp(kbuf, "off", count)) {
ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
} else {
diff --git a/mm/damon/prmtv-common.c b/mm/damon/ops-common.c
index 92a04f5831d6..e346cc10d143 100644
--- a/mm/damon/prmtv-common.c
+++ b/mm/damon/ops-common.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
-#include "prmtv-common.h"
+#include "ops-common.h"
/*
* Get an online page for a pfn if it's in the LRU list. Otherwise, returns
diff --git a/mm/damon/prmtv-common.h b/mm/damon/ops-common.h
index e790cb5f8fe0..e790cb5f8fe0 100644
--- a/mm/damon/prmtv-common.h
+++ b/mm/damon/ops-common.h
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 5e8244f65a1a..21474ae63bc7 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -14,16 +14,12 @@
#include <linux/swap.h>
#include "../internal.h"
-#include "prmtv-common.h"
+#include "ops-common.h"
-static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
+static bool __damon_pa_mkold(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
while (page_vma_mapped_walk(&pvmw)) {
addr = pvmw.address;
@@ -37,32 +33,34 @@ static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
static void damon_pa_mkold(unsigned long paddr)
{
+ struct folio *folio;
struct page *page = damon_get_page(PHYS_PFN(paddr));
struct rmap_walk_control rwc = {
.rmap_one = __damon_pa_mkold,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
if (!page)
return;
+ folio = page_folio(page);
- if (!page_mapped(page) || !page_rmapping(page)) {
- set_page_idle(page);
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ folio_set_idle(folio);
goto out;
}
- need_lock = !PageAnon(page) || PageKsm(page);
- if (need_lock && !trylock_page(page))
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
goto out;
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
if (need_lock)
- unlock_page(page);
+ folio_unlock(folio);
out:
- put_page(page);
+ folio_put(folio);
}
static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
@@ -89,15 +87,11 @@ struct damon_pa_access_chk_result {
bool accessed;
};
-static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
+static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
struct damon_pa_access_chk_result *result = arg;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
result->accessed = false;
result->page_sz = PAGE_SIZE;
@@ -105,12 +99,12 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
addr = pvmw.address;
if (pvmw.pte) {
result->accessed = pte_young(*pvmw.pte) ||
- !page_is_idle(page) ||
+ !folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
result->accessed = pmd_young(*pvmw.pmd) ||
- !page_is_idle(page) ||
+ !folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
#else
@@ -129,6 +123,7 @@ static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
{
+ struct folio *folio;
struct page *page = damon_get_page(PHYS_PFN(paddr));
struct damon_pa_access_chk_result result = {
.page_sz = PAGE_SIZE,
@@ -137,33 +132,34 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
struct rmap_walk_control rwc = {
.arg = &result,
.rmap_one = __damon_pa_young,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
if (!page)
return false;
+ folio = page_folio(page);
- if (!page_mapped(page) || !page_rmapping(page)) {
- if (page_is_idle(page))
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio)) {
+ if (folio_test_idle(folio))
result.accessed = false;
else
result.accessed = true;
- put_page(page);
+ folio_put(folio);
goto out;
}
- need_lock = !PageAnon(page) || PageKsm(page);
- if (need_lock && !trylock_page(page)) {
- put_page(page);
- return NULL;
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio)) {
+ folio_put(folio);
+ return false;
}
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
if (need_lock)
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out:
*page_sz = result.page_sz;
@@ -208,11 +204,6 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-bool damon_pa_target_valid(void *t)
-{
- return true;
-}
-
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme)
@@ -261,15 +252,22 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
return DAMOS_MAX_SCORE;
}
-void damon_pa_set_primitives(struct damon_ctx *ctx)
+static int __init damon_pa_initcall(void)
{
- ctx->primitive.init = NULL;
- ctx->primitive.update = NULL;
- ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks;
- ctx->primitive.check_accesses = damon_pa_check_accesses;
- ctx->primitive.reset_aggregated = NULL;
- ctx->primitive.target_valid = damon_pa_target_valid;
- ctx->primitive.cleanup = NULL;
- ctx->primitive.apply_scheme = damon_pa_apply_scheme;
- ctx->primitive.get_scheme_score = damon_pa_scheme_score;
-}
+ struct damon_operations ops = {
+ .id = DAMON_OPS_PADDR,
+ .init = NULL,
+ .update = NULL,
+ .prepare_access_checks = damon_pa_prepare_access_checks,
+ .check_accesses = damon_pa_check_accesses,
+ .reset_aggregated = NULL,
+ .target_valid = NULL,
+ .cleanup = NULL,
+ .apply_scheme = damon_pa_apply_scheme,
+ .get_scheme_score = damon_pa_scheme_score,
+ };
+
+ return damon_register_ops(&ops);
+};
+
+subsys_initcall(damon_pa_initcall);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index bc476cef688e..e34c4d0c4d93 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -330,7 +330,7 @@ static int damon_reclaim_turn(bool on)
if (err)
goto free_scheme_out;
- err = damon_start(&ctx, 1);
+ err = damon_start(&ctx, 1, true);
if (!err) {
kdamond_pid = ctx->kdamond->pid;
return 0;
@@ -384,11 +384,12 @@ static int __init damon_reclaim_init(void)
if (!ctx)
return -ENOMEM;
- damon_pa_set_primitives(ctx);
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ return -EINVAL;
+
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
- /* 4242 means nothing but fun */
- target = damon_new_target(4242);
+ target = damon_new_target();
if (!target) {
damon_destroy_ctx(ctx);
return -ENOMEM;
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
new file mode 100644
index 000000000000..48e434cd43d8
--- /dev/null
+++ b/mm/damon/sysfs.c
@@ -0,0 +1,2596 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON sysfs Interface
+ *
+ * Copyright (c) 2022 SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/kobject.h>
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+static DEFINE_MUTEX(damon_sysfs_lock);
+
+/*
+ * unsigned long range directory
+ */
+
+struct damon_sysfs_ul_range {
+ struct kobject kobj;
+ unsigned long min;
+ unsigned long max;
+};
+
+static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc(
+ unsigned long min,
+ unsigned long max)
+{
+ struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range),
+ GFP_KERNEL);
+
+ if (!range)
+ return NULL;
+ range->kobj = (struct kobject){};
+ range->min = min;
+ range->max = max;
+
+ return range;
+}
+
+static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->min);
+}
+
+static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long min;
+ int err;
+
+ err = kstrtoul(buf, 0, &min);
+ if (err)
+ return -EINVAL;
+
+ range->min = min;
+ return count;
+}
+
+static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+
+ return sysfs_emit(buf, "%lu\n", range->max);
+}
+
+static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_ul_range *range = container_of(kobj,
+ struct damon_sysfs_ul_range, kobj);
+ unsigned long max;
+ int err;
+
+ err = kstrtoul(buf, 0, &max);
+ if (err)
+ return -EINVAL;
+
+ range->max = max;
+ return count;
+}
+
+static void damon_sysfs_ul_range_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_ul_range_min_attr =
+ __ATTR_RW_MODE(min, 0600);
+
+static struct kobj_attribute damon_sysfs_ul_range_max_attr =
+ __ATTR_RW_MODE(max, 0600);
+
+static struct attribute *damon_sysfs_ul_range_attrs[] = {
+ &damon_sysfs_ul_range_min_attr.attr,
+ &damon_sysfs_ul_range_max_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ul_range);
+
+static struct kobj_type damon_sysfs_ul_range_ktype = {
+ .release = damon_sysfs_ul_range_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ul_range_groups,
+};
+
+/*
+ * schemes/stats directory
+ */
+
+struct damon_sysfs_stats {
+ struct kobject kobj;
+ unsigned long nr_tried;
+ unsigned long sz_tried;
+ unsigned long nr_applied;
+ unsigned long sz_applied;
+ unsigned long qt_exceeds;
+};
+
+static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL);
+}
+
+static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_tried);
+}
+
+static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_tried);
+}
+
+static ssize_t nr_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->nr_applied);
+}
+
+static ssize_t sz_applied_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->sz_applied);
+}
+
+static ssize_t qt_exceeds_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_stats *stats = container_of(kobj,
+ struct damon_sysfs_stats, kobj);
+
+ return sysfs_emit(buf, "%lu\n", stats->qt_exceeds);
+}
+
+static void damon_sysfs_stats_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_stats, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_stats_nr_tried_attr =
+ __ATTR_RO_MODE(nr_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_tried_attr =
+ __ATTR_RO_MODE(sz_tried, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_nr_applied_attr =
+ __ATTR_RO_MODE(nr_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_sz_applied_attr =
+ __ATTR_RO_MODE(sz_applied, 0400);
+
+static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr =
+ __ATTR_RO_MODE(qt_exceeds, 0400);
+
+static struct attribute *damon_sysfs_stats_attrs[] = {
+ &damon_sysfs_stats_nr_tried_attr.attr,
+ &damon_sysfs_stats_sz_tried_attr.attr,
+ &damon_sysfs_stats_nr_applied_attr.attr,
+ &damon_sysfs_stats_sz_applied_attr.attr,
+ &damon_sysfs_stats_qt_exceeds_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_stats);
+
+static struct kobj_type damon_sysfs_stats_ktype = {
+ .release = damon_sysfs_stats_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_stats_groups,
+};
+
+/*
+ * watermarks directory
+ */
+
+struct damon_sysfs_watermarks {
+ struct kobject kobj;
+ enum damos_wmark_metric metric;
+ unsigned long interval_us;
+ unsigned long high;
+ unsigned long mid;
+ unsigned long low;
+};
+
+static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc(
+ enum damos_wmark_metric metric, unsigned long interval_us,
+ unsigned long high, unsigned long mid, unsigned long low)
+{
+ struct damon_sysfs_watermarks *watermarks = kmalloc(
+ sizeof(*watermarks), GFP_KERNEL);
+
+ if (!watermarks)
+ return NULL;
+ watermarks->kobj = (struct kobject){};
+ watermarks->metric = metric;
+ watermarks->interval_us = interval_us;
+ watermarks->high = high;
+ watermarks->mid = mid;
+ watermarks->low = low;
+ return watermarks;
+}
+
+/* Should match with enum damos_wmark_metric */
+static const char * const damon_sysfs_wmark_metric_strs[] = {
+ "none",
+ "free_mem_rate",
+};
+
+static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_wmark_metric_strs[watermarks->metric]);
+}
+
+static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ enum damos_wmark_metric metric;
+
+ for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) {
+ if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) {
+ watermarks->metric = metric;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t interval_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->interval_us);
+}
+
+static ssize_t interval_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->interval_us);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t high_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->high);
+}
+
+static ssize_t high_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->high);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t mid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->mid);
+}
+
+static ssize_t mid_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->mid);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t low_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+
+ return sysfs_emit(buf, "%lu\n", watermarks->low);
+}
+
+static ssize_t low_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_watermarks *watermarks = container_of(kobj,
+ struct damon_sysfs_watermarks, kobj);
+ int err = kstrtoul(buf, 0, &watermarks->low);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_watermarks_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_watermarks_metric_attr =
+ __ATTR_RW_MODE(metric, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr =
+ __ATTR_RW_MODE(interval_us, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_high_attr =
+ __ATTR_RW_MODE(high, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_mid_attr =
+ __ATTR_RW_MODE(mid, 0600);
+
+static struct kobj_attribute damon_sysfs_watermarks_low_attr =
+ __ATTR_RW_MODE(low, 0600);
+
+static struct attribute *damon_sysfs_watermarks_attrs[] = {
+ &damon_sysfs_watermarks_metric_attr.attr,
+ &damon_sysfs_watermarks_interval_us_attr.attr,
+ &damon_sysfs_watermarks_high_attr.attr,
+ &damon_sysfs_watermarks_mid_attr.attr,
+ &damon_sysfs_watermarks_low_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_watermarks);
+
+static struct kobj_type damon_sysfs_watermarks_ktype = {
+ .release = damon_sysfs_watermarks_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_watermarks_groups,
+};
+
+/*
+ * scheme/weights directory
+ */
+
+struct damon_sysfs_weights {
+ struct kobject kobj;
+ unsigned int sz;
+ unsigned int nr_accesses;
+ unsigned int age;
+};
+
+static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz,
+ unsigned int nr_accesses, unsigned int age)
+{
+ struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights),
+ GFP_KERNEL);
+
+ if (!weights)
+ return NULL;
+ weights->kobj = (struct kobject){};
+ weights->sz = sz;
+ weights->nr_accesses = nr_accesses;
+ weights->age = age;
+ return weights;
+}
+
+static ssize_t sz_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->sz);
+}
+
+static ssize_t sz_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->sz);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t nr_accesses_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->nr_accesses);
+}
+
+static ssize_t nr_accesses_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->nr_accesses);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t age_permil_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+
+ return sysfs_emit(buf, "%u\n", weights->age);
+}
+
+static ssize_t age_permil_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_weights *weights = container_of(kobj,
+ struct damon_sysfs_weights, kobj);
+ int err = kstrtouint(buf, 0, &weights->age);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_weights_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_weights, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_weights_sz_attr =
+ __ATTR_RW_MODE(sz_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr =
+ __ATTR_RW_MODE(nr_accesses_permil, 0600);
+
+static struct kobj_attribute damon_sysfs_weights_age_attr =
+ __ATTR_RW_MODE(age_permil, 0600);
+
+static struct attribute *damon_sysfs_weights_attrs[] = {
+ &damon_sysfs_weights_sz_attr.attr,
+ &damon_sysfs_weights_nr_accesses_attr.attr,
+ &damon_sysfs_weights_age_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_weights);
+
+static struct kobj_type damon_sysfs_weights_ktype = {
+ .release = damon_sysfs_weights_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_weights_groups,
+};
+
+/*
+ * quotas directory
+ */
+
+struct damon_sysfs_quotas {
+ struct kobject kobj;
+ struct damon_sysfs_weights *weights;
+ unsigned long ms;
+ unsigned long sz;
+ unsigned long reset_interval_ms;
+};
+
+static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL);
+}
+
+static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas)
+{
+ struct damon_sysfs_weights *weights;
+ int err;
+
+ weights = damon_sysfs_weights_alloc(0, 0, 0);
+ if (!weights)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype,
+ &quotas->kobj, "weights");
+ if (err)
+ kobject_put(&weights->kobj);
+ else
+ quotas->weights = weights;
+ return err;
+}
+
+static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas)
+{
+ kobject_put(&quotas->weights->kobj);
+}
+
+static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->ms);
+}
+
+static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->sz);
+}
+
+static ssize_t bytes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->sz);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t reset_interval_ms_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+
+ return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms);
+}
+
+static ssize_t reset_interval_ms_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_quotas *quotas = container_of(kobj,
+ struct damon_sysfs_quotas, kobj);
+ int err = kstrtoul(buf, 0, &quotas->reset_interval_ms);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_quotas_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_quotas, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_quotas_ms_attr =
+ __ATTR_RW_MODE(ms, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_sz_attr =
+ __ATTR_RW_MODE(bytes, 0600);
+
+static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr =
+ __ATTR_RW_MODE(reset_interval_ms, 0600);
+
+static struct attribute *damon_sysfs_quotas_attrs[] = {
+ &damon_sysfs_quotas_ms_attr.attr,
+ &damon_sysfs_quotas_sz_attr.attr,
+ &damon_sysfs_quotas_reset_interval_ms_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_quotas);
+
+static struct kobj_type damon_sysfs_quotas_ktype = {
+ .release = damon_sysfs_quotas_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_quotas_groups,
+};
+
+/*
+ * access_pattern directory
+ */
+
+struct damon_sysfs_access_pattern {
+ struct kobject kobj;
+ struct damon_sysfs_ul_range *sz;
+ struct damon_sysfs_ul_range *nr_accesses;
+ struct damon_sysfs_ul_range *age;
+};
+
+static
+struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void)
+{
+ struct damon_sysfs_access_pattern *access_pattern =
+ kmalloc(sizeof(*access_pattern), GFP_KERNEL);
+
+ if (!access_pattern)
+ return NULL;
+ access_pattern->kobj = (struct kobject){};
+ return access_pattern;
+}
+
+static int damon_sysfs_access_pattern_add_range_dir(
+ struct damon_sysfs_access_pattern *access_pattern,
+ struct damon_sysfs_ul_range **range_dir_ptr,
+ char *name)
+{
+ struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0);
+ int err;
+
+ if (!range)
+ return -ENOMEM;
+ err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype,
+ &access_pattern->kobj, name);
+ if (err)
+ kobject_put(&range->kobj);
+ else
+ *range_dir_ptr = range;
+ return err;
+}
+
+static int damon_sysfs_access_pattern_add_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ int err;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->sz, "sz");
+ if (err)
+ goto put_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->nr_accesses, "nr_accesses");
+ if (err)
+ goto put_nr_accesses_sz_out;
+
+ err = damon_sysfs_access_pattern_add_range_dir(access_pattern,
+ &access_pattern->age, "age");
+ if (err)
+ goto put_age_nr_accesses_sz_out;
+ return 0;
+
+put_age_nr_accesses_sz_out:
+ kobject_put(&access_pattern->age->kobj);
+ access_pattern->age = NULL;
+put_nr_accesses_sz_out:
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ access_pattern->nr_accesses = NULL;
+put_sz_out:
+ kobject_put(&access_pattern->sz->kobj);
+ access_pattern->sz = NULL;
+ return err;
+}
+
+static void damon_sysfs_access_pattern_rm_dirs(
+ struct damon_sysfs_access_pattern *access_pattern)
+{
+ kobject_put(&access_pattern->sz->kobj);
+ kobject_put(&access_pattern->nr_accesses->kobj);
+ kobject_put(&access_pattern->age->kobj);
+}
+
+static void damon_sysfs_access_pattern_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj));
+}
+
+static struct attribute *damon_sysfs_access_pattern_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_access_pattern);
+
+static struct kobj_type damon_sysfs_access_pattern_ktype = {
+ .release = damon_sysfs_access_pattern_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_access_pattern_groups,
+};
+
+/*
+ * scheme directory
+ */
+
+struct damon_sysfs_scheme {
+ struct kobject kobj;
+ enum damos_action action;
+ struct damon_sysfs_access_pattern *access_pattern;
+ struct damon_sysfs_quotas *quotas;
+ struct damon_sysfs_watermarks *watermarks;
+ struct damon_sysfs_stats *stats;
+};
+
+/* This should match with enum damos_action */
+static const char * const damon_sysfs_damos_action_strs[] = {
+ "willneed",
+ "cold",
+ "pageout",
+ "hugepage",
+ "nohugepage",
+ "stat",
+};
+
+static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc(
+ enum damos_action action)
+{
+ struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme),
+ GFP_KERNEL);
+
+ if (!scheme)
+ return NULL;
+ scheme->kobj = (struct kobject){};
+ scheme->action = action;
+ return scheme;
+}
+
+static int damon_sysfs_scheme_set_access_pattern(
+ struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_access_pattern *access_pattern;
+ int err;
+
+ access_pattern = damon_sysfs_access_pattern_alloc();
+ if (!access_pattern)
+ return -ENOMEM;
+ err = kobject_init_and_add(&access_pattern->kobj,
+ &damon_sysfs_access_pattern_ktype, &scheme->kobj,
+ "access_pattern");
+ if (err)
+ goto out;
+ err = damon_sysfs_access_pattern_add_dirs(access_pattern);
+ if (err)
+ goto out;
+ scheme->access_pattern = access_pattern;
+ return 0;
+
+out:
+ kobject_put(&access_pattern->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc();
+ int err;
+
+ if (!quotas)
+ return -ENOMEM;
+ err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype,
+ &scheme->kobj, "quotas");
+ if (err)
+ goto out;
+ err = damon_sysfs_quotas_add_dirs(quotas);
+ if (err)
+ goto out;
+ scheme->quotas = quotas;
+ return 0;
+
+out:
+ kobject_put(&quotas->kobj);
+ return err;
+}
+
+static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_watermarks *watermarks =
+ damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0);
+ int err;
+
+ if (!watermarks)
+ return -ENOMEM;
+ err = kobject_init_and_add(&watermarks->kobj,
+ &damon_sysfs_watermarks_ktype, &scheme->kobj,
+ "watermarks");
+ if (err)
+ kobject_put(&watermarks->kobj);
+ else
+ scheme->watermarks = watermarks;
+ return err;
+}
+
+static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme)
+{
+ struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc();
+ int err;
+
+ if (!stats)
+ return -ENOMEM;
+ err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype,
+ &scheme->kobj, "stats");
+ if (err)
+ kobject_put(&stats->kobj);
+ else
+ scheme->stats = stats;
+ return err;
+}
+
+static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
+{
+ int err;
+
+ err = damon_sysfs_scheme_set_access_pattern(scheme);
+ if (err)
+ return err;
+ err = damon_sysfs_scheme_set_quotas(scheme);
+ if (err)
+ goto put_access_pattern_out;
+ err = damon_sysfs_scheme_set_watermarks(scheme);
+ if (err)
+ goto put_quotas_access_pattern_out;
+ err = damon_sysfs_scheme_set_stats(scheme);
+ if (err)
+ goto put_watermarks_quotas_access_pattern_out;
+ return 0;
+
+put_watermarks_quotas_access_pattern_out:
+ kobject_put(&scheme->watermarks->kobj);
+ scheme->watermarks = NULL;
+put_quotas_access_pattern_out:
+ kobject_put(&scheme->quotas->kobj);
+ scheme->quotas = NULL;
+put_access_pattern_out:
+ kobject_put(&scheme->access_pattern->kobj);
+ scheme->access_pattern = NULL;
+ return err;
+}
+
+static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme)
+{
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
+ kobject_put(&scheme->access_pattern->kobj);
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
+ kobject_put(&scheme->quotas->kobj);
+ kobject_put(&scheme->watermarks->kobj);
+ kobject_put(&scheme->stats->kobj);
+}
+
+static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+
+ return sysfs_emit(buf, "%s\n",
+ damon_sysfs_damos_action_strs[scheme->action]);
+}
+
+static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_scheme *scheme = container_of(kobj,
+ struct damon_sysfs_scheme, kobj);
+ enum damos_action action;
+
+ for (action = 0; action < NR_DAMOS_ACTIONS; action++) {
+ if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) {
+ scheme->action = action;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static void damon_sysfs_scheme_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_scheme, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_scheme_action_attr =
+ __ATTR_RW_MODE(action, 0600);
+
+static struct attribute *damon_sysfs_scheme_attrs[] = {
+ &damon_sysfs_scheme_action_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_scheme);
+
+static struct kobj_type damon_sysfs_scheme_ktype = {
+ .release = damon_sysfs_scheme_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_scheme_groups,
+};
+
+/*
+ * schemes directory
+ */
+
+struct damon_sysfs_schemes {
+ struct kobject kobj;
+ struct damon_sysfs_scheme **schemes_arr;
+ int nr;
+};
+
+static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL);
+}
+
+static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr;
+ int i;
+
+ for (i = 0; i < schemes->nr; i++) {
+ damon_sysfs_scheme_rm_dirs(schemes_arr[i]);
+ kobject_put(&schemes_arr[i]->kobj);
+ }
+ schemes->nr = 0;
+ kfree(schemes_arr);
+ schemes->schemes_arr = NULL;
+}
+
+static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes,
+ int nr_schemes)
+{
+ struct damon_sysfs_scheme **schemes_arr, *scheme;
+ int err, i;
+
+ damon_sysfs_schemes_rm_dirs(schemes);
+ if (!nr_schemes)
+ return 0;
+
+ schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!schemes_arr)
+ return -ENOMEM;
+ schemes->schemes_arr = schemes_arr;
+
+ for (i = 0; i < nr_schemes; i++) {
+ scheme = damon_sysfs_scheme_alloc(DAMOS_STAT);
+ if (!scheme) {
+ damon_sysfs_schemes_rm_dirs(schemes);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&scheme->kobj,
+ &damon_sysfs_scheme_ktype, &schemes->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+ err = damon_sysfs_scheme_add_dirs(scheme);
+ if (err)
+ goto out;
+
+ schemes_arr[i] = scheme;
+ schemes->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_schemes_rm_dirs(schemes);
+ kobject_put(&scheme->kobj);
+ return err;
+}
+
+static ssize_t nr_schemes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_schemes *schemes = container_of(kobj,
+ struct damon_sysfs_schemes, kobj);
+
+ return sysfs_emit(buf, "%d\n", schemes->nr);
+}
+
+static ssize_t nr_schemes_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_schemes *schemes = container_of(kobj,
+ struct damon_sysfs_schemes, kobj);
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_schemes_add_dirs(schemes, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+ return count;
+}
+
+static void damon_sysfs_schemes_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_schemes, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_schemes_nr_attr =
+ __ATTR_RW_MODE(nr_schemes, 0600);
+
+static struct attribute *damon_sysfs_schemes_attrs[] = {
+ &damon_sysfs_schemes_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_schemes);
+
+static struct kobj_type damon_sysfs_schemes_ktype = {
+ .release = damon_sysfs_schemes_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_schemes_groups,
+};
+
+/*
+ * init region directory
+ */
+
+struct damon_sysfs_region {
+ struct kobject kobj;
+ unsigned long start;
+ unsigned long end;
+};
+
+static struct damon_sysfs_region *damon_sysfs_region_alloc(
+ unsigned long start,
+ unsigned long end)
+{
+ struct damon_sysfs_region *region = kmalloc(sizeof(*region),
+ GFP_KERNEL);
+
+ if (!region)
+ return NULL;
+ region->kobj = (struct kobject){};
+ region->start = start;
+ region->end = end;
+ return region;
+}
+
+static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->start);
+}
+
+static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+ int err = kstrtoul(buf, 0, &region->start);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+
+ return sysfs_emit(buf, "%lu\n", region->end);
+}
+
+static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_region *region = container_of(kobj,
+ struct damon_sysfs_region, kobj);
+ int err = kstrtoul(buf, 0, &region->end);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_region_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_region, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_region_start_attr =
+ __ATTR_RW_MODE(start, 0600);
+
+static struct kobj_attribute damon_sysfs_region_end_attr =
+ __ATTR_RW_MODE(end, 0600);
+
+static struct attribute *damon_sysfs_region_attrs[] = {
+ &damon_sysfs_region_start_attr.attr,
+ &damon_sysfs_region_end_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_region);
+
+static struct kobj_type damon_sysfs_region_ktype = {
+ .release = damon_sysfs_region_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_region_groups,
+};
+
+/*
+ * init_regions directory
+ */
+
+struct damon_sysfs_regions {
+ struct kobject kobj;
+ struct damon_sysfs_region **regions_arr;
+ int nr;
+};
+
+static struct damon_sysfs_regions *damon_sysfs_regions_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_regions), GFP_KERNEL);
+}
+
+static void damon_sysfs_regions_rm_dirs(struct damon_sysfs_regions *regions)
+{
+ struct damon_sysfs_region **regions_arr = regions->regions_arr;
+ int i;
+
+ for (i = 0; i < regions->nr; i++)
+ kobject_put(&regions_arr[i]->kobj);
+ regions->nr = 0;
+ kfree(regions_arr);
+ regions->regions_arr = NULL;
+}
+
+static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions,
+ int nr_regions)
+{
+ struct damon_sysfs_region **regions_arr, *region;
+ int err, i;
+
+ damon_sysfs_regions_rm_dirs(regions);
+ if (!nr_regions)
+ return 0;
+
+ regions_arr = kmalloc_array(nr_regions, sizeof(*regions_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!regions_arr)
+ return -ENOMEM;
+ regions->regions_arr = regions_arr;
+
+ for (i = 0; i < nr_regions; i++) {
+ region = damon_sysfs_region_alloc(0, 0);
+ if (!region) {
+ damon_sysfs_regions_rm_dirs(regions);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&region->kobj,
+ &damon_sysfs_region_ktype, &regions->kobj,
+ "%d", i);
+ if (err) {
+ kobject_put(&region->kobj);
+ damon_sysfs_regions_rm_dirs(regions);
+ return err;
+ }
+
+ regions_arr[i] = region;
+ regions->nr++;
+ }
+ return 0;
+}
+
+static ssize_t nr_regions_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_regions *regions = container_of(kobj,
+ struct damon_sysfs_regions, kobj);
+
+ return sysfs_emit(buf, "%d\n", regions->nr);
+}
+
+static ssize_t nr_regions_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_regions *regions = container_of(kobj,
+ struct damon_sysfs_regions, kobj);
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_regions_add_dirs(regions, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_regions_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_regions, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_regions_nr_attr =
+ __ATTR_RW_MODE(nr_regions, 0600);
+
+static struct attribute *damon_sysfs_regions_attrs[] = {
+ &damon_sysfs_regions_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_regions);
+
+static struct kobj_type damon_sysfs_regions_ktype = {
+ .release = damon_sysfs_regions_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_regions_groups,
+};
+
+/*
+ * target directory
+ */
+
+struct damon_sysfs_target {
+ struct kobject kobj;
+ struct damon_sysfs_regions *regions;
+ int pid;
+};
+
+static struct damon_sysfs_target *damon_sysfs_target_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_target), GFP_KERNEL);
+}
+
+static int damon_sysfs_target_add_dirs(struct damon_sysfs_target *target)
+{
+ struct damon_sysfs_regions *regions = damon_sysfs_regions_alloc();
+ int err;
+
+ if (!regions)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&regions->kobj, &damon_sysfs_regions_ktype,
+ &target->kobj, "regions");
+ if (err)
+ kobject_put(&regions->kobj);
+ else
+ target->regions = regions;
+ return err;
+}
+
+static void damon_sysfs_target_rm_dirs(struct damon_sysfs_target *target)
+{
+ damon_sysfs_regions_rm_dirs(target->regions);
+ kobject_put(&target->regions->kobj);
+}
+
+static ssize_t pid_target_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+
+ return sysfs_emit(buf, "%d\n", target->pid);
+}
+
+static ssize_t pid_target_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_target *target = container_of(kobj,
+ struct damon_sysfs_target, kobj);
+ int err = kstrtoint(buf, 0, &target->pid);
+
+ if (err)
+ return -EINVAL;
+ return count;
+}
+
+static void damon_sysfs_target_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_target, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_target_pid_attr =
+ __ATTR_RW_MODE(pid_target, 0600);
+
+static struct attribute *damon_sysfs_target_attrs[] = {
+ &damon_sysfs_target_pid_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_target);
+
+static struct kobj_type damon_sysfs_target_ktype = {
+ .release = damon_sysfs_target_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_target_groups,
+};
+
+/*
+ * targets directory
+ */
+
+struct damon_sysfs_targets {
+ struct kobject kobj;
+ struct damon_sysfs_target **targets_arr;
+ int nr;
+};
+
+static struct damon_sysfs_targets *damon_sysfs_targets_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_targets), GFP_KERNEL);
+}
+
+static void damon_sysfs_targets_rm_dirs(struct damon_sysfs_targets *targets)
+{
+ struct damon_sysfs_target **targets_arr = targets->targets_arr;
+ int i;
+
+ for (i = 0; i < targets->nr; i++) {
+ damon_sysfs_target_rm_dirs(targets_arr[i]);
+ kobject_put(&targets_arr[i]->kobj);
+ }
+ targets->nr = 0;
+ kfree(targets_arr);
+ targets->targets_arr = NULL;
+}
+
+static int damon_sysfs_targets_add_dirs(struct damon_sysfs_targets *targets,
+ int nr_targets)
+{
+ struct damon_sysfs_target **targets_arr, *target;
+ int err, i;
+
+ damon_sysfs_targets_rm_dirs(targets);
+ if (!nr_targets)
+ return 0;
+
+ targets_arr = kmalloc_array(nr_targets, sizeof(*targets_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!targets_arr)
+ return -ENOMEM;
+ targets->targets_arr = targets_arr;
+
+ for (i = 0; i < nr_targets; i++) {
+ target = damon_sysfs_target_alloc();
+ if (!target) {
+ damon_sysfs_targets_rm_dirs(targets);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&target->kobj,
+ &damon_sysfs_target_ktype, &targets->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_target_add_dirs(target);
+ if (err)
+ goto out;
+
+ targets_arr[i] = target;
+ targets->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_targets_rm_dirs(targets);
+ kobject_put(&target->kobj);
+ return err;
+}
+
+static ssize_t nr_targets_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_targets *targets = container_of(kobj,
+ struct damon_sysfs_targets, kobj);
+
+ return sysfs_emit(buf, "%d\n", targets->nr);
+}
+
+static ssize_t nr_targets_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_targets *targets = container_of(kobj,
+ struct damon_sysfs_targets, kobj);
+ int nr, err = kstrtoint(buf, 0, &nr);
+
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_targets_add_dirs(targets, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_targets_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_targets, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_targets_nr_attr =
+ __ATTR_RW_MODE(nr_targets, 0600);
+
+static struct attribute *damon_sysfs_targets_attrs[] = {
+ &damon_sysfs_targets_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_targets);
+
+static struct kobj_type damon_sysfs_targets_ktype = {
+ .release = damon_sysfs_targets_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_targets_groups,
+};
+
+/*
+ * intervals directory
+ */
+
+struct damon_sysfs_intervals {
+ struct kobject kobj;
+ unsigned long sample_us;
+ unsigned long aggr_us;
+ unsigned long update_us;
+};
+
+static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc(
+ unsigned long sample_us, unsigned long aggr_us,
+ unsigned long update_us)
+{
+ struct damon_sysfs_intervals *intervals = kmalloc(sizeof(*intervals),
+ GFP_KERNEL);
+
+ if (!intervals)
+ return NULL;
+
+ intervals->kobj = (struct kobject){};
+ intervals->sample_us = sample_us;
+ intervals->aggr_us = aggr_us;
+ intervals->update_us = update_us;
+ return intervals;
+}
+
+static ssize_t sample_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->sample_us);
+}
+
+static ssize_t sample_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return -EINVAL;
+
+ intervals->sample_us = us;
+ return count;
+}
+
+static ssize_t aggr_us_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->aggr_us);
+}
+
+static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return -EINVAL;
+
+ intervals->aggr_us = us;
+ return count;
+}
+
+static ssize_t update_us_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+
+ return sysfs_emit(buf, "%lu\n", intervals->update_us);
+}
+
+static ssize_t update_us_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_intervals *intervals = container_of(kobj,
+ struct damon_sysfs_intervals, kobj);
+ unsigned long us;
+ int err = kstrtoul(buf, 0, &us);
+
+ if (err)
+ return -EINVAL;
+
+ intervals->update_us = us;
+ return count;
+}
+
+static void damon_sysfs_intervals_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_intervals, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_intervals_sample_us_attr =
+ __ATTR_RW_MODE(sample_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_aggr_us_attr =
+ __ATTR_RW_MODE(aggr_us, 0600);
+
+static struct kobj_attribute damon_sysfs_intervals_update_us_attr =
+ __ATTR_RW_MODE(update_us, 0600);
+
+static struct attribute *damon_sysfs_intervals_attrs[] = {
+ &damon_sysfs_intervals_sample_us_attr.attr,
+ &damon_sysfs_intervals_aggr_us_attr.attr,
+ &damon_sysfs_intervals_update_us_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_intervals);
+
+static struct kobj_type damon_sysfs_intervals_ktype = {
+ .release = damon_sysfs_intervals_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_intervals_groups,
+};
+
+/*
+ * monitoring_attrs directory
+ */
+
+struct damon_sysfs_attrs {
+ struct kobject kobj;
+ struct damon_sysfs_intervals *intervals;
+ struct damon_sysfs_ul_range *nr_regions_range;
+};
+
+static struct damon_sysfs_attrs *damon_sysfs_attrs_alloc(void)
+{
+ struct damon_sysfs_attrs *attrs = kmalloc(sizeof(*attrs), GFP_KERNEL);
+
+ if (!attrs)
+ return NULL;
+ attrs->kobj = (struct kobject){};
+ return attrs;
+}
+
+static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
+{
+ struct damon_sysfs_intervals *intervals;
+ struct damon_sysfs_ul_range *nr_regions_range;
+ int err;
+
+ intervals = damon_sysfs_intervals_alloc(5000, 100000, 60000000);
+ if (!intervals)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&intervals->kobj,
+ &damon_sysfs_intervals_ktype, &attrs->kobj,
+ "intervals");
+ if (err)
+ goto put_intervals_out;
+ attrs->intervals = intervals;
+
+ nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
+ if (!nr_regions_range) {
+ err = -ENOMEM;
+ goto put_intervals_out;
+ }
+
+ err = kobject_init_and_add(&nr_regions_range->kobj,
+ &damon_sysfs_ul_range_ktype, &attrs->kobj,
+ "nr_regions");
+ if (err)
+ goto put_nr_regions_intervals_out;
+ attrs->nr_regions_range = nr_regions_range;
+ return 0;
+
+put_nr_regions_intervals_out:
+ kobject_put(&nr_regions_range->kobj);
+ attrs->nr_regions_range = NULL;
+put_intervals_out:
+ kobject_put(&intervals->kobj);
+ attrs->intervals = NULL;
+ return err;
+}
+
+static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs)
+{
+ kobject_put(&attrs->nr_regions_range->kobj);
+ kobject_put(&attrs->intervals->kobj);
+}
+
+static void damon_sysfs_attrs_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_attrs, kobj));
+}
+
+static struct attribute *damon_sysfs_attrs_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_attrs);
+
+static struct kobj_type damon_sysfs_attrs_ktype = {
+ .release = damon_sysfs_attrs_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_attrs_groups,
+};
+
+/*
+ * context directory
+ */
+
+/* This should match with enum damon_ops_id */
+static const char * const damon_sysfs_ops_strs[] = {
+ "vaddr",
+ "paddr",
+};
+
+struct damon_sysfs_context {
+ struct kobject kobj;
+ enum damon_ops_id ops_id;
+ struct damon_sysfs_attrs *attrs;
+ struct damon_sysfs_targets *targets;
+ struct damon_sysfs_schemes *schemes;
+};
+
+static struct damon_sysfs_context *damon_sysfs_context_alloc(
+ enum damon_ops_id ops_id)
+{
+ struct damon_sysfs_context *context = kmalloc(sizeof(*context),
+ GFP_KERNEL);
+
+ if (!context)
+ return NULL;
+ context->kobj = (struct kobject){};
+ context->ops_id = ops_id;
+ return context;
+}
+
+static int damon_sysfs_context_set_attrs(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_attrs *attrs = damon_sysfs_attrs_alloc();
+ int err;
+
+ if (!attrs)
+ return -ENOMEM;
+ err = kobject_init_and_add(&attrs->kobj, &damon_sysfs_attrs_ktype,
+ &context->kobj, "monitoring_attrs");
+ if (err)
+ goto out;
+ err = damon_sysfs_attrs_add_dirs(attrs);
+ if (err)
+ goto out;
+ context->attrs = attrs;
+ return 0;
+
+out:
+ kobject_put(&attrs->kobj);
+ return err;
+}
+
+static int damon_sysfs_context_set_targets(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_targets *targets = damon_sysfs_targets_alloc();
+ int err;
+
+ if (!targets)
+ return -ENOMEM;
+ err = kobject_init_and_add(&targets->kobj, &damon_sysfs_targets_ktype,
+ &context->kobj, "targets");
+ if (err) {
+ kobject_put(&targets->kobj);
+ return err;
+ }
+ context->targets = targets;
+ return 0;
+}
+
+static int damon_sysfs_context_set_schemes(struct damon_sysfs_context *context)
+{
+ struct damon_sysfs_schemes *schemes = damon_sysfs_schemes_alloc();
+ int err;
+
+ if (!schemes)
+ return -ENOMEM;
+ err = kobject_init_and_add(&schemes->kobj, &damon_sysfs_schemes_ktype,
+ &context->kobj, "schemes");
+ if (err) {
+ kobject_put(&schemes->kobj);
+ return err;
+ }
+ context->schemes = schemes;
+ return 0;
+}
+
+static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context)
+{
+ int err;
+
+ err = damon_sysfs_context_set_attrs(context);
+ if (err)
+ return err;
+
+ err = damon_sysfs_context_set_targets(context);
+ if (err)
+ goto put_attrs_out;
+
+ err = damon_sysfs_context_set_schemes(context);
+ if (err)
+ goto put_targets_attrs_out;
+ return 0;
+
+put_targets_attrs_out:
+ kobject_put(&context->targets->kobj);
+ context->targets = NULL;
+put_attrs_out:
+ kobject_put(&context->attrs->kobj);
+ context->attrs = NULL;
+ return err;
+}
+
+static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
+{
+ damon_sysfs_attrs_rm_dirs(context->attrs);
+ kobject_put(&context->attrs->kobj);
+ damon_sysfs_targets_rm_dirs(context->targets);
+ kobject_put(&context->targets->kobj);
+ damon_sysfs_schemes_rm_dirs(context->schemes);
+ kobject_put(&context->schemes->kobj);
+}
+
+static ssize_t operations_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+
+ return sysfs_emit(buf, "%s\n", damon_sysfs_ops_strs[context->ops_id]);
+}
+
+static ssize_t operations_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_context *context = container_of(kobj,
+ struct damon_sysfs_context, kobj);
+ enum damon_ops_id id;
+
+ for (id = 0; id < NR_DAMON_OPS; id++) {
+ if (sysfs_streq(buf, damon_sysfs_ops_strs[id])) {
+ context->ops_id = id;
+ return count;
+ }
+ }
+ return -EINVAL;
+}
+
+static void damon_sysfs_context_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_context, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_context_operations_attr =
+ __ATTR_RW_MODE(operations, 0600);
+
+static struct attribute *damon_sysfs_context_attrs[] = {
+ &damon_sysfs_context_operations_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_context);
+
+static struct kobj_type damon_sysfs_context_ktype = {
+ .release = damon_sysfs_context_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_context_groups,
+};
+
+/*
+ * contexts directory
+ */
+
+struct damon_sysfs_contexts {
+ struct kobject kobj;
+ struct damon_sysfs_context **contexts_arr;
+ int nr;
+};
+
+static struct damon_sysfs_contexts *damon_sysfs_contexts_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_contexts), GFP_KERNEL);
+}
+
+static void damon_sysfs_contexts_rm_dirs(struct damon_sysfs_contexts *contexts)
+{
+ struct damon_sysfs_context **contexts_arr = contexts->contexts_arr;
+ int i;
+
+ for (i = 0; i < contexts->nr; i++) {
+ damon_sysfs_context_rm_dirs(contexts_arr[i]);
+ kobject_put(&contexts_arr[i]->kobj);
+ }
+ contexts->nr = 0;
+ kfree(contexts_arr);
+ contexts->contexts_arr = NULL;
+}
+
+static int damon_sysfs_contexts_add_dirs(struct damon_sysfs_contexts *contexts,
+ int nr_contexts)
+{
+ struct damon_sysfs_context **contexts_arr, *context;
+ int err, i;
+
+ damon_sysfs_contexts_rm_dirs(contexts);
+ if (!nr_contexts)
+ return 0;
+
+ contexts_arr = kmalloc_array(nr_contexts, sizeof(*contexts_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!contexts_arr)
+ return -ENOMEM;
+ contexts->contexts_arr = contexts_arr;
+
+ for (i = 0; i < nr_contexts; i++) {
+ context = damon_sysfs_context_alloc(DAMON_OPS_VADDR);
+ if (!context) {
+ damon_sysfs_contexts_rm_dirs(contexts);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&context->kobj,
+ &damon_sysfs_context_ktype, &contexts->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_context_add_dirs(context);
+ if (err)
+ goto out;
+
+ contexts_arr[i] = context;
+ contexts->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_contexts_rm_dirs(contexts);
+ kobject_put(&context->kobj);
+ return err;
+}
+
+static ssize_t nr_contexts_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_contexts *contexts = container_of(kobj,
+ struct damon_sysfs_contexts, kobj);
+
+ return sysfs_emit(buf, "%d\n", contexts->nr);
+}
+
+static ssize_t nr_contexts_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_contexts *contexts = container_of(kobj,
+ struct damon_sysfs_contexts, kobj);
+ int nr, err;
+
+ err = kstrtoint(buf, 0, &nr);
+ if (err)
+ return err;
+ /* TODO: support multiple contexts per kdamond */
+ if (nr < 0 || 1 < nr)
+ return -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_contexts_add_dirs(contexts, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_contexts_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_contexts, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_contexts_nr_attr
+ = __ATTR_RW_MODE(nr_contexts, 0600);
+
+static struct attribute *damon_sysfs_contexts_attrs[] = {
+ &damon_sysfs_contexts_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_contexts);
+
+static struct kobj_type damon_sysfs_contexts_ktype = {
+ .release = damon_sysfs_contexts_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_contexts_groups,
+};
+
+/*
+ * kdamond directory
+ */
+
+struct damon_sysfs_kdamond {
+ struct kobject kobj;
+ struct damon_sysfs_contexts *contexts;
+ struct damon_ctx *damon_ctx;
+};
+
+static struct damon_sysfs_kdamond *damon_sysfs_kdamond_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_kdamond), GFP_KERNEL);
+}
+
+static int damon_sysfs_kdamond_add_dirs(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_sysfs_contexts *contexts;
+ int err;
+
+ contexts = damon_sysfs_contexts_alloc();
+ if (!contexts)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&contexts->kobj,
+ &damon_sysfs_contexts_ktype, &kdamond->kobj,
+ "contexts");
+ if (err) {
+ kobject_put(&contexts->kobj);
+ return err;
+ }
+ kdamond->contexts = contexts;
+
+ return err;
+}
+
+static void damon_sysfs_kdamond_rm_dirs(struct damon_sysfs_kdamond *kdamond)
+{
+ damon_sysfs_contexts_rm_dirs(kdamond->contexts);
+ kobject_put(&kdamond->contexts->kobj);
+}
+
+static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
+{
+ bool running;
+
+ mutex_lock(&ctx->kdamond_lock);
+ running = ctx->kdamond != NULL;
+ mutex_unlock(&ctx->kdamond_lock);
+ return running;
+}
+
+static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+ bool running;
+
+ if (!ctx)
+ running = false;
+ else
+ running = damon_sysfs_ctx_running(ctx);
+
+ return sysfs_emit(buf, "%s\n", running ? "on" : "off");
+}
+
+static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
+ struct damon_sysfs_attrs *sys_attrs)
+{
+ struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
+ struct damon_sysfs_ul_range *sys_nr_regions =
+ sys_attrs->nr_regions_range;
+
+ return damon_set_attrs(ctx, sys_intervals->sample_us,
+ sys_intervals->aggr_us, sys_intervals->update_us,
+ sys_nr_regions->min, sys_nr_regions->max);
+}
+
+static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next;
+
+ damon_for_each_target_safe(t, next, ctx) {
+ if (ctx->ops.id == DAMON_OPS_VADDR)
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+}
+
+static int damon_sysfs_set_regions(struct damon_target *t,
+ struct damon_sysfs_regions *sysfs_regions)
+{
+ int i;
+
+ for (i = 0; i < sysfs_regions->nr; i++) {
+ struct damon_sysfs_region *sys_region =
+ sysfs_regions->regions_arr[i];
+ struct damon_region *prev, *r;
+
+ if (sys_region->start > sys_region->end)
+ return -EINVAL;
+ r = damon_new_region(sys_region->start, sys_region->end);
+ if (!r)
+ return -ENOMEM;
+ damon_add_region(r, t);
+ if (damon_nr_regions(t) > 1) {
+ prev = damon_prev_region(r);
+ if (prev->ar.end > r->ar.start) {
+ damon_destroy_region(r, t);
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
+static int damon_sysfs_set_targets(struct damon_ctx *ctx,
+ struct damon_sysfs_targets *sysfs_targets)
+{
+ int i, err;
+
+ for (i = 0; i < sysfs_targets->nr; i++) {
+ struct damon_sysfs_target *sys_target =
+ sysfs_targets->targets_arr[i];
+ struct damon_target *t = damon_new_target();
+
+ if (!t) {
+ damon_sysfs_destroy_targets(ctx);
+ return -ENOMEM;
+ }
+ if (ctx->ops.id == DAMON_OPS_VADDR) {
+ t->pid = find_get_pid(sys_target->pid);
+ if (!t->pid) {
+ damon_sysfs_destroy_targets(ctx);
+ return -EINVAL;
+ }
+ }
+ damon_add_target(ctx, t);
+ err = damon_sysfs_set_regions(t, sys_target->regions);
+ if (err) {
+ damon_sysfs_destroy_targets(ctx);
+ return err;
+ }
+ }
+ return 0;
+}
+
+static struct damos *damon_sysfs_mk_scheme(
+ struct damon_sysfs_scheme *sysfs_scheme)
+{
+ struct damon_sysfs_access_pattern *pattern =
+ sysfs_scheme->access_pattern;
+ struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
+ struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
+ struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+ struct damos_quota quota = {
+ .ms = sysfs_quotas->ms,
+ .sz = sysfs_quotas->sz,
+ .reset_interval = sysfs_quotas->reset_interval_ms,
+ .weight_sz = sysfs_weights->sz,
+ .weight_nr_accesses = sysfs_weights->nr_accesses,
+ .weight_age = sysfs_weights->age,
+ };
+ struct damos_watermarks wmarks = {
+ .metric = sysfs_wmarks->metric,
+ .interval = sysfs_wmarks->interval_us,
+ .high = sysfs_wmarks->high,
+ .mid = sysfs_wmarks->mid,
+ .low = sysfs_wmarks->low,
+ };
+
+ return damon_new_scheme(pattern->sz->min, pattern->sz->max,
+ pattern->nr_accesses->min, pattern->nr_accesses->max,
+ pattern->age->min, pattern->age->max,
+ sysfs_scheme->action, &quota, &wmarks);
+}
+
+static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
+ struct damon_sysfs_schemes *sysfs_schemes)
+{
+ int i;
+
+ for (i = 0; i < sysfs_schemes->nr; i++) {
+ struct damos *scheme, *next;
+
+ scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]);
+ if (!scheme) {
+ damon_for_each_scheme_safe(scheme, next, ctx)
+ damon_destroy_scheme(scheme);
+ return -ENOMEM;
+ }
+ damon_add_scheme(ctx, scheme);
+ }
+ return 0;
+}
+
+static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
+{
+ struct damon_target *t, *next;
+
+ if (ctx->ops.id != DAMON_OPS_VADDR)
+ return;
+
+ mutex_lock(&ctx->kdamond_lock);
+ damon_for_each_target_safe(t, next, ctx) {
+ put_pid(t->pid);
+ damon_destroy_target(t);
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+}
+
+static struct damon_ctx *damon_sysfs_build_ctx(
+ struct damon_sysfs_context *sys_ctx)
+{
+ struct damon_ctx *ctx = damon_new_ctx();
+ int err;
+
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ err = damon_select_ops(ctx, sys_ctx->ops_id);
+ if (err)
+ goto out;
+ err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
+ if (err)
+ goto out;
+ err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
+ if (err)
+ goto out;
+ err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
+ if (err)
+ goto out;
+
+ ctx->callback.before_terminate = damon_sysfs_before_terminate;
+ return ctx;
+
+out:
+ damon_destroy_ctx(ctx);
+ return ERR_PTR(err);
+}
+
+static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx;
+ int err;
+
+ if (kdamond->damon_ctx &&
+ damon_sysfs_ctx_running(kdamond->damon_ctx))
+ return -EBUSY;
+ /* TODO: support multiple contexts per kdamond */
+ if (kdamond->contexts->nr != 1)
+ return -EINVAL;
+
+ if (kdamond->damon_ctx)
+ damon_destroy_ctx(kdamond->damon_ctx);
+ kdamond->damon_ctx = NULL;
+
+ ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ err = damon_start(&ctx, 1, false);
+ if (err) {
+ damon_destroy_ctx(ctx);
+ return err;
+ }
+ kdamond->damon_ctx = ctx;
+ return err;
+}
+
+static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
+{
+ if (!kdamond->damon_ctx)
+ return -EINVAL;
+ return damon_stop(&kdamond->damon_ctx, 1);
+ /*
+ * To allow users show final monitoring results of already turned-off
+ * DAMON, we free kdamond->damon_ctx in next
+ * damon_sysfs_turn_damon_on(), or kdamonds_nr_store()
+ */
+}
+
+static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ if (!ctx)
+ return -EINVAL;
+ mutex_lock(&ctx->kdamond_lock);
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_schemes *sysfs_schemes;
+ struct damon_sysfs_stats *sysfs_stats;
+
+ sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes;
+ sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
+ sysfs_stats->nr_tried = scheme->stat.nr_tried;
+ sysfs_stats->sz_tried = scheme->stat.sz_tried;
+ sysfs_stats->nr_applied = scheme->stat.nr_applied;
+ sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ }
+ mutex_unlock(&ctx->kdamond_lock);
+ return 0;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ ssize_t ret;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ if (sysfs_streq(buf, "on"))
+ ret = damon_sysfs_turn_damon_on(kdamond);
+ else if (sysfs_streq(buf, "off"))
+ ret = damon_sysfs_turn_damon_off(kdamond);
+ else if (sysfs_streq(buf, "update_schemes_stats"))
+ ret = damon_sysfs_update_schemes_stats(kdamond);
+ else
+ ret = -EINVAL;
+ mutex_unlock(&damon_sysfs_lock);
+ if (!ret)
+ ret = count;
+ return ret;
+}
+
+static ssize_t pid_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+ struct damon_ctx *ctx;
+ int pid;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ ctx = kdamond->damon_ctx;
+ if (!ctx) {
+ pid = -1;
+ goto out;
+ }
+ mutex_lock(&ctx->kdamond_lock);
+ if (!ctx->kdamond)
+ pid = -1;
+ else
+ pid = ctx->kdamond->pid;
+ mutex_unlock(&ctx->kdamond_lock);
+out:
+ mutex_unlock(&damon_sysfs_lock);
+ return sysfs_emit(buf, "%d\n", pid);
+}
+
+static void damon_sysfs_kdamond_release(struct kobject *kobj)
+{
+ struct damon_sysfs_kdamond *kdamond = container_of(kobj,
+ struct damon_sysfs_kdamond, kobj);
+
+ if (kdamond->damon_ctx)
+ damon_destroy_ctx(kdamond->damon_ctx);
+ kfree(kdamond);
+}
+
+static struct kobj_attribute damon_sysfs_kdamond_state_attr =
+ __ATTR_RW_MODE(state, 0600);
+
+static struct kobj_attribute damon_sysfs_kdamond_pid_attr =
+ __ATTR_RO_MODE(pid, 0400);
+
+static struct attribute *damon_sysfs_kdamond_attrs[] = {
+ &damon_sysfs_kdamond_state_attr.attr,
+ &damon_sysfs_kdamond_pid_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_kdamond);
+
+static struct kobj_type damon_sysfs_kdamond_ktype = {
+ .release = damon_sysfs_kdamond_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_kdamond_groups,
+};
+
+/*
+ * kdamonds directory
+ */
+
+struct damon_sysfs_kdamonds {
+ struct kobject kobj;
+ struct damon_sysfs_kdamond **kdamonds_arr;
+ int nr;
+};
+
+static struct damon_sysfs_kdamonds *damon_sysfs_kdamonds_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_kdamonds), GFP_KERNEL);
+}
+
+static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds)
+{
+ struct damon_sysfs_kdamond **kdamonds_arr = kdamonds->kdamonds_arr;
+ int i;
+
+ for (i = 0; i < kdamonds->nr; i++) {
+ damon_sysfs_kdamond_rm_dirs(kdamonds_arr[i]);
+ kobject_put(&kdamonds_arr[i]->kobj);
+ }
+ kdamonds->nr = 0;
+ kfree(kdamonds_arr);
+ kdamonds->kdamonds_arr = NULL;
+}
+
+static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds,
+ int nr_kdamonds)
+{
+ int nr_running_ctxs = 0;
+ int i;
+
+ for (i = 0; i < nr_kdamonds; i++) {
+ struct damon_ctx *ctx = kdamonds[i]->damon_ctx;
+
+ if (!ctx)
+ continue;
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond)
+ nr_running_ctxs++;
+ mutex_unlock(&ctx->kdamond_lock);
+ }
+ return nr_running_ctxs;
+}
+
+static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
+ int nr_kdamonds)
+{
+ struct damon_sysfs_kdamond **kdamonds_arr, *kdamond;
+ int err, i;
+
+ if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr))
+ return -EBUSY;
+
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ if (!nr_kdamonds)
+ return 0;
+
+ kdamonds_arr = kmalloc_array(nr_kdamonds, sizeof(*kdamonds_arr),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!kdamonds_arr)
+ return -ENOMEM;
+ kdamonds->kdamonds_arr = kdamonds_arr;
+
+ for (i = 0; i < nr_kdamonds; i++) {
+ kdamond = damon_sysfs_kdamond_alloc();
+ if (!kdamond) {
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ return -ENOMEM;
+ }
+
+ err = kobject_init_and_add(&kdamond->kobj,
+ &damon_sysfs_kdamond_ktype, &kdamonds->kobj,
+ "%d", i);
+ if (err)
+ goto out;
+
+ err = damon_sysfs_kdamond_add_dirs(kdamond);
+ if (err)
+ goto out;
+
+ kdamonds_arr[i] = kdamond;
+ kdamonds->nr++;
+ }
+ return 0;
+
+out:
+ damon_sysfs_kdamonds_rm_dirs(kdamonds);
+ kobject_put(&kdamond->kobj);
+ return err;
+}
+
+static ssize_t nr_kdamonds_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct damon_sysfs_kdamonds *kdamonds = container_of(kobj,
+ struct damon_sysfs_kdamonds, kobj);
+
+ return sysfs_emit(buf, "%d\n", kdamonds->nr);
+}
+
+static ssize_t nr_kdamonds_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
+{
+ struct damon_sysfs_kdamonds *kdamonds = container_of(kobj,
+ struct damon_sysfs_kdamonds, kobj);
+ int nr, err;
+
+ err = kstrtoint(buf, 0, &nr);
+ if (err)
+ return err;
+ if (nr < 0)
+ return -EINVAL;
+
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return -EBUSY;
+ err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr);
+ mutex_unlock(&damon_sysfs_lock);
+ if (err)
+ return err;
+
+ return count;
+}
+
+static void damon_sysfs_kdamonds_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_kdamonds, kobj));
+}
+
+static struct kobj_attribute damon_sysfs_kdamonds_nr_attr =
+ __ATTR_RW_MODE(nr_kdamonds, 0600);
+
+static struct attribute *damon_sysfs_kdamonds_attrs[] = {
+ &damon_sysfs_kdamonds_nr_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_kdamonds);
+
+static struct kobj_type damon_sysfs_kdamonds_ktype = {
+ .release = damon_sysfs_kdamonds_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_kdamonds_groups,
+};
+
+/*
+ * damon user interface directory
+ */
+
+struct damon_sysfs_ui_dir {
+ struct kobject kobj;
+ struct damon_sysfs_kdamonds *kdamonds;
+};
+
+static struct damon_sysfs_ui_dir *damon_sysfs_ui_dir_alloc(void)
+{
+ return kzalloc(sizeof(struct damon_sysfs_ui_dir), GFP_KERNEL);
+}
+
+static int damon_sysfs_ui_dir_add_dirs(struct damon_sysfs_ui_dir *ui_dir)
+{
+ struct damon_sysfs_kdamonds *kdamonds;
+ int err;
+
+ kdamonds = damon_sysfs_kdamonds_alloc();
+ if (!kdamonds)
+ return -ENOMEM;
+
+ err = kobject_init_and_add(&kdamonds->kobj,
+ &damon_sysfs_kdamonds_ktype, &ui_dir->kobj,
+ "kdamonds");
+ if (err) {
+ kobject_put(&kdamonds->kobj);
+ return err;
+ }
+ ui_dir->kdamonds = kdamonds;
+ return err;
+}
+
+static void damon_sysfs_ui_dir_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct damon_sysfs_ui_dir, kobj));
+}
+
+static struct attribute *damon_sysfs_ui_dir_attrs[] = {
+ NULL,
+};
+ATTRIBUTE_GROUPS(damon_sysfs_ui_dir);
+
+static struct kobj_type damon_sysfs_ui_dir_ktype = {
+ .release = damon_sysfs_ui_dir_release,
+ .sysfs_ops = &kobj_sysfs_ops,
+ .default_groups = damon_sysfs_ui_dir_groups,
+};
+
+static int __init damon_sysfs_init(void)
+{
+ struct kobject *damon_sysfs_root;
+ struct damon_sysfs_ui_dir *admin;
+ int err;
+
+ damon_sysfs_root = kobject_create_and_add("damon", mm_kobj);
+ if (!damon_sysfs_root)
+ return -ENOMEM;
+
+ admin = damon_sysfs_ui_dir_alloc();
+ if (!admin) {
+ kobject_put(damon_sysfs_root);
+ return -ENOMEM;
+ }
+ err = kobject_init_and_add(&admin->kobj, &damon_sysfs_ui_dir_ktype,
+ damon_sysfs_root, "admin");
+ if (err)
+ goto out;
+ err = damon_sysfs_ui_dir_add_dirs(admin);
+ if (err)
+ goto out;
+ return 0;
+
+out:
+ kobject_put(&admin->kobj);
+ kobject_put(damon_sysfs_root);
+ return err;
+}
+subsys_initcall(damon_sysfs_init);
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 6a1b9272ea12..1a55bb6c36c3 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -139,7 +139,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
struct damon_region *r;
int i;
- t = damon_new_target(42);
+ t = damon_new_target();
for (i = 0; i < nr_regions / 2; i++) {
r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
damon_add_region(r, t);
@@ -251,7 +251,7 @@ static void damon_test_apply_three_regions4(struct kunit *test)
static void damon_test_split_evenly_fail(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
- struct damon_target *t = damon_new_target(42);
+ struct damon_target *t = damon_new_target();
struct damon_region *r = damon_new_region(start, end);
damon_add_region(r, t);
@@ -270,7 +270,7 @@ static void damon_test_split_evenly_fail(struct kunit *test,
static void damon_test_split_evenly_succ(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
- struct damon_target *t = damon_new_target(42);
+ struct damon_target *t = damon_new_target();
struct damon_region *r = damon_new_region(start, end);
unsigned long expected_width = (end - start) / nr_pieces;
unsigned long i = 0;
@@ -314,7 +314,7 @@ static struct kunit_case damon_test_cases[] = {
};
static struct kunit_suite damon_test_suite = {
- .name = "damon-primitives",
+ .name = "damon-operations",
.test_cases = damon_test_cases,
};
kunit_test_suite(damon_test_suite);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 89b6468da2b9..b2ec0aa1ff45 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -15,7 +15,7 @@
#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
-#include "prmtv-common.h"
+#include "ops-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
@@ -23,12 +23,12 @@
#endif
/*
- * 't->id' should be the pointer to the relevant 'struct pid' having reference
+ * 't->pid' should be the pointer to the relevant 'struct pid' having reference
* count. Caller must put the returned task, unless it is NULL.
*/
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
- return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
+ return get_pid_task(t->pid, PIDTYPE_PID);
}
/*
@@ -402,9 +402,6 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
pte_t entry = huge_ptep_get(pte);
struct page *page = pte_page(entry);
- if (!page)
- return;
-
get_page(page);
if (pte_young(entry)) {
@@ -564,9 +561,6 @@ static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
goto out;
page = pte_page(entry);
- if (!page)
- goto out;
-
get_page(page);
if (pte_young(entry) || !page_is_idle(page) ||
@@ -659,7 +653,7 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
* Functions for the target validity check and cleanup
*/
-bool damon_va_target_valid(void *target)
+static bool damon_va_target_valid(void *target)
{
struct damon_target *t = target;
struct task_struct *task;
@@ -745,17 +739,24 @@ static int damon_va_scheme_score(struct damon_ctx *context,
return DAMOS_MAX_SCORE;
}
-void damon_va_set_primitives(struct damon_ctx *ctx)
+static int __init damon_va_initcall(void)
{
- ctx->primitive.init = damon_va_init;
- ctx->primitive.update = damon_va_update;
- ctx->primitive.prepare_access_checks = damon_va_prepare_access_checks;
- ctx->primitive.check_accesses = damon_va_check_accesses;
- ctx->primitive.reset_aggregated = NULL;
- ctx->primitive.target_valid = damon_va_target_valid;
- ctx->primitive.cleanup = NULL;
- ctx->primitive.apply_scheme = damon_va_apply_scheme;
- ctx->primitive.get_scheme_score = damon_va_scheme_score;
-}
+ struct damon_operations ops = {
+ .id = DAMON_OPS_VADDR,
+ .init = damon_va_init,
+ .update = damon_va_update,
+ .prepare_access_checks = damon_va_prepare_access_checks,
+ .check_accesses = damon_va_check_accesses,
+ .reset_aggregated = NULL,
+ .target_valid = damon_va_target_valid,
+ .cleanup = NULL,
+ .apply_scheme = damon_va_apply_scheme,
+ .get_scheme_score = damon_va_scheme_score,
+ };
+
+ return damon_register_ops(&ops);
+};
+
+subsys_initcall(damon_va_initcall);
#include "vaddr-test.h"
diff --git a/mm/debug.c b/mm/debug.c
index bc9ac87f0e08..bef329bf28f0 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -48,7 +48,8 @@ const struct trace_print_flags vmaflag_names[] = {
static void __dump_page(struct page *page)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct address_space *mapping;
bool compound = PageCompound(page);
/*
@@ -76,6 +77,7 @@ static void __dump_page(struct page *page)
else
mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
+ folio = (struct folio *)page;
compound = false;
} else {
mapping = page_mapping(page);
@@ -92,16 +94,10 @@ static void __dump_page(struct page *page)
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
- if (hpage_pincount_available(page)) {
- pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
- head, compound_order(head),
- head_compound_mapcount(head),
- head_compound_pincount(head));
- } else {
- pr_warn("head:%p order:%u compound_mapcount:%d\n",
- head, compound_order(head),
- head_compound_mapcount(head));
- }
+ pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ head, compound_order(head),
+ folio_entire_mapcount(folio),
+ head_compound_pincount(head));
}
#ifdef CONFIG_MEMCG
@@ -265,5 +261,4 @@ void page_init_poison(struct page *page, size_t size)
if (page_init_poisoning)
memset(page, PAGE_POISON_PATTERN, size);
}
-EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a7ac97c76762..db2abd9e415b 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
pte = ptep_get(args->ptep);
WARN_ON(pte_young(pte));
+
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
}
static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 74984c23a87e..9bc12e526ed0 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -17,6 +17,7 @@
#include <linux/vmalloc.h>
#include <asm/fixmap.h>
#include <asm/early_ioremap.h>
+#include "internal.h"
#ifdef CONFIG_MMU
static int early_ioremap_debug __initdata;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d6baa4f451c5..338f16022012 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!inode_write_congested(mapping->host))
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
/*
* First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index ad8c39d90bf9..647d72bf23b6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -72,7 +72,7 @@
* Lock ordering:
*
* ->i_mmap_rwsem (truncate_pagecache)
- * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->private_lock (__free_pte->block_dirty_folio)
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
@@ -115,7 +115,7 @@
* ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
- * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ * ->private_lock (zap_pte_range->block_dirty_folio)
*
* ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
@@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping,
VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
- int mapcount;
-
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
current->comm, folio_pfn(folio));
dump_page(&folio->page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- mapcount = page_mapcount(&folio->page);
- if (mapping_exiting(mapping) &&
- folio_ref_count(folio) >= mapcount + 2) {
- /*
- * All vmas have already been torn down, so it's
- * a good bet that actually the folio is unmapped,
- * and we'd prefer not to leak it: if we're wrong,
- * some other bad page check should catch it later.
- */
- page_mapcount_reset(&folio->page);
- folio_ref_sub(folio, mapcount);
+ if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+ int mapcount = page_mapcount(&folio->page);
+
+ if (folio_ref_count(folio) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped
+ * and we'd rather not leak it: if we're wrong,
+ * another bad page check should catch it later.
+ */
+ page_mapcount_reset(&folio->page);
+ folio_ref_sub(folio, mapcount);
+ }
}
}
@@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping,
/*
* At this point folio must be either written or cleaned by
* truncate. Dirty folio here signals a bug and loss of
- * unwritten data.
+ * unwritten data - on ordinary filesystems.
*
- * This fixes dirty accounting after removing the folio entirely
+ * But it's harmless on in-memory filesystems like tmpfs; and can
+ * occur when a driver which did get_user_pages() sets page dirty
+ * before putting it, while the inode is being finally evicted.
+ *
+ * Below fixes dirty accounting after removing the folio entirely
* but leaves the dirty flag set: it has no effect for truncated
* folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
- if (WARN_ON_ONCE(folio_test_dirty(folio)))
- folio_account_cleaned(folio, mapping,
- inode_to_wb(mapping->host));
+ if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+ mapping_can_writeback(mapping)))
+ folio_account_cleaned(folio, inode_to_wb(mapping->host));
}
/*
@@ -842,26 +846,27 @@ noinline int __filemap_add_folio(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, index);
int huge = folio_test_hugetlb(folio);
- int error;
bool charged = false;
+ long nr = 1;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- folio_get(folio);
- folio->mapping = mapping;
- folio->index = index;
-
if (!huge) {
- error = mem_cgroup_charge(folio, NULL, gfp);
+ int error = mem_cgroup_charge(folio, NULL, gfp);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
- goto error;
+ return error;
charged = true;
+ xas_set_order(&xas, index, folio_order(folio));
+ nr = folio_nr_pages(folio);
}
gfp &= GFP_RECLAIM_MASK;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = xas.xa_index;
do {
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
@@ -885,6 +890,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
/* entry may have been split before we acquired lock */
order = xa_get_order(xas.xa, xas.xa_index);
if (order > folio_order(folio)) {
+ /* How to handle large swap entries? */
+ BUG_ON(shmem_mapping(mapping));
xas_split(&xas, old, order);
xas_reset(&xas);
}
@@ -894,29 +901,31 @@ noinline int __filemap_add_folio(struct address_space *mapping,
if (xas_error(&xas))
goto unlock;
- mapping->nrpages++;
+ mapping->nrpages += nr;
/* hugetlb pages do not participate in page cache accounting */
- if (!huge)
- __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
+ if (!huge) {
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio,
+ NR_FILE_THPS, nr);
+ }
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
- if (xas_error(&xas)) {
- error = xas_error(&xas);
- if (charged)
- mem_cgroup_uncharge(folio);
+ if (xas_error(&xas))
goto error;
- }
trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
+ if (charged)
+ mem_cgroup_uncharge(folio);
folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- folio_put(folio);
- return error;
+ folio_put_refs(folio, nr);
+ return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
@@ -1054,6 +1063,12 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+
+ /*
+ * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date,
+ * and splice's page_cache_pipe_buf_confirm() needs to see that.
+ */
+ SetPageUptodate(ZERO_PAGE(0));
}
/*
@@ -1174,24 +1189,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
}
/*
- * It is possible for other pages to have collided on the waitqueue
- * hash, so in that case check for a page match. That prevents a long-
- * term waiter
+ * It's possible to miss clearing waiters here, when we woke our page
+ * waiters, but the hashed waitqueue has waiters for other pages on it.
+ * That's okay, it's a rare case. The next waker will clear it.
*
- * It is still possible to miss a case here, when we woke page waiters
- * and removed them from the waitqueue, but there are still other
- * page waiters.
+ * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
+ * other), the flag may be cleared in the course of freeing the page;
+ * but that is not required for correctness.
*/
- if (!waitqueue_active(q) || !key.page_match) {
+ if (!waitqueue_active(q) || !key.page_match)
folio_clear_waiters(folio);
- /*
- * It's possible to miss clearing Waiters here, when we woke
- * our page waiters, but the hashed waitqueue has waiters for
- * other pages on it.
- *
- * That's okay, it's a rare case. The next waker will clear it.
- */
- }
+
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -2229,8 +2237,9 @@ out:
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages_contig() works exactly like find_get_pages(), except
- * that the returned number of pages are guaranteed to be contiguous.
+ * find_get_pages_contig() works exactly like find_get_pages_range(),
+ * except that the returned number of pages are guaranteed to be
+ * contiguous.
*
* Return: the number of pages which were found.
*/
@@ -2290,9 +2299,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
- * Like find_get_pages(), except we only return head pages which are tagged
- * with @tag. @index is updated to the index immediately after the last
- * page we return, ready for the next iteration.
+ * Like find_get_pages_range(), except we only return head pages which are
+ * tagged with @tag. @index is updated to the index immediately after the
+ * last page we return, ready for the next iteration.
*
* Return: the number of pages which were found.
*/
@@ -2452,7 +2461,7 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
- return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count);
+ return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
static int filemap_update_page(struct kiocb *iocb,
@@ -2844,7 +2853,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
offset = offset_in_folio(folio, start) & ~(bsz - 1);
do {
- if (ops->is_partially_uptodate(&folio->page, offset, bsz) ==
+ if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
start = (start + bsz) & ~(bsz - 1);
@@ -2990,6 +2999,24 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *fpin = NULL;
unsigned int mmap_miss;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* Use the readahead code, even if readahead is disabled */
+ if (vmf->vma->vm_flags & VM_HUGEPAGE) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
+ ra->size = HPAGE_PMD_NR;
+ /*
+ * Fetch two PMD folios, so we get the chance to actually
+ * readahead, unless we've been told not to.
+ */
+ if (!(vmf->vma->vm_flags & VM_RAND_READ))
+ ra->size *= 2;
+ ra->async_size = HPAGE_PMD_NR;
+ page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ return fpin;
+ }
+#endif
+
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
@@ -3022,7 +3049,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
- do_page_cache_ra(&ractl, ra->size, ra->async_size);
+ page_cache_ra_order(&ractl, ra, 0);
return fpin;
}
@@ -3752,7 +3779,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 749555a232a8..46fa179e32fb 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -7,6 +7,7 @@
#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include "internal.h"
struct address_space *page_mapping(struct page *page)
{
@@ -151,3 +152,15 @@ int try_to_release_page(struct page *page, gfp_t gfp)
return filemap_release_folio(page_folio(page), gfp);
}
EXPORT_SYMBOL(try_to_release_page);
+
+int isolate_lru_page(struct page *page)
+{
+ if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
+ return -EBUSY;
+ return folio_isolate_lru((struct folio *)page);
+}
+
+void putback_lru_page(struct page *page)
+{
+ folio_putback_lru(page_folio(page));
+}
diff --git a/mm/gup.c b/mm/gup.c
index f0af462ac1e2..271fbe8195d7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,107 +29,71 @@ struct follow_page_context {
unsigned int page_mask;
};
-static void hpage_pincount_add(struct page *page, int refs)
-{
- VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
- VM_BUG_ON_PAGE(page != compound_head(page), page);
-
- atomic_add(refs, compound_pincount_ptr(page));
-}
-
-static void hpage_pincount_sub(struct page *page, int refs)
-{
- VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
- VM_BUG_ON_PAGE(page != compound_head(page), page);
-
- atomic_sub(refs, compound_pincount_ptr(page));
-}
-
-/* Equivalent to calling put_page() @refs times. */
-static void put_page_refs(struct page *page, int refs)
-{
-#ifdef CONFIG_DEBUG_VM
- if (VM_WARN_ON_ONCE_PAGE(page_ref_count(page) < refs, page))
- return;
-#endif
-
- /*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
- */
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
-}
-
/*
- * Return the compound head page with ref appropriately incremented,
+ * Return the folio with ref appropriately incremented,
* or NULL if that failed.
*/
-static inline struct page *try_get_compound_head(struct page *page, int refs)
+static inline struct folio *try_get_folio(struct page *page, int refs)
{
- struct page *head = compound_head(page);
+ struct folio *folio;
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
+retry:
+ folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
return NULL;
- if (unlikely(!page_cache_add_speculative(head, refs)))
+ if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
return NULL;
/*
- * At this point we have a stable reference to the head page; but it
- * could be that between the compound_head() lookup and the refcount
- * increment, the compound page was split, in which case we'd end up
- * holding a reference on a page that has nothing to do with the page
+ * At this point we have a stable reference to the folio; but it
+ * could be that between calling page_folio() and the refcount
+ * increment, the folio was split, in which case we'd end up
+ * holding a reference on a folio that has nothing to do with the page
* we were given anymore.
- * So now that the head page is stable, recheck that the pages still
- * belong together.
+ * So now that the folio is stable, recheck that the page still
+ * belongs to this folio.
*/
- if (unlikely(compound_head(page) != head)) {
- put_page_refs(head, refs);
- return NULL;
+ if (unlikely(page_folio(page) != folio)) {
+ folio_put_refs(folio, refs);
+ goto retry;
}
- return head;
+ return folio;
}
/**
- * try_grab_compound_head() - attempt to elevate a page's refcount, by a
- * flags-dependent amount.
- *
- * Even though the name includes "compound_head", this function is still
- * appropriate for callers that have a non-compound @page to get.
- *
+ * try_grab_folio() - Attempt to get or pin a folio.
* @page: pointer to page to be grabbed
- * @refs: the value to (effectively) add to the page's refcount
+ * @refs: the value to (effectively) add to the folio's refcount
* @flags: gup flags: these are the FOLL_* flag values.
*
* "grab" names in this file mean, "look at flags to decide whether to use
- * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
+ * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
*
* Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
* same time. (That's true throughout the get_user_pages*() and
* pin_user_pages*() APIs.) Cases:
*
- * FOLL_GET: page's refcount will be incremented by @refs.
+ * FOLL_GET: folio's refcount will be incremented by @refs.
*
- * FOLL_PIN on compound pages that are > two pages long: page's refcount will
- * be incremented by @refs, and page[2].hpage_pinned_refcount will be
- * incremented by @refs * GUP_PIN_COUNTING_BIAS.
+ * FOLL_PIN on large folios: folio's refcount will be incremented by
+ * @refs, and its compound_pincount will be incremented by @refs.
*
- * FOLL_PIN on normal pages, or compound pages that are two pages long:
- * page's refcount will be incremented by @refs * GUP_PIN_COUNTING_BIAS.
+ * FOLL_PIN on single-page folios: folio's refcount will be incremented by
+ * @refs * GUP_PIN_COUNTING_BIAS.
*
- * Return: head page (with refcount appropriately incremented) for success, or
- * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
- * considered failure, and furthermore, a likely bug in the caller, so a warning
- * is also emitted.
+ * Return: The folio containing @page (with refcount appropriately
+ * incremented) for success, or NULL upon failure. If neither FOLL_GET
+ * nor FOLL_PIN was set, that's considered failure, and furthermore,
+ * a likely bug in the caller, so a warning is also emitted.
*/
-struct page *try_grab_compound_head(struct page *page,
- int refs, unsigned int flags)
+struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
if (flags & FOLL_GET)
- return try_get_compound_head(page, refs);
+ return try_get_folio(page, refs);
else if (flags & FOLL_PIN) {
+ struct folio *folio;
+
/*
* Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
* right zone, so fail and let the caller fall back to the slow
@@ -143,63 +107,57 @@ struct page *try_grab_compound_head(struct page *page,
* CAUTION: Don't use compound_head() on the page before this
* point, the result won't be stable.
*/
- page = try_get_compound_head(page, refs);
- if (!page)
+ folio = try_get_folio(page, refs);
+ if (!folio)
return NULL;
/*
- * When pinning a compound page of order > 1 (which is what
- * hpage_pincount_available() checks for), use an exact count to
- * track it, via hpage_pincount_add/_sub().
+ * When pinning a large folio, use an exact count to track it.
*
- * However, be sure to *also* increment the normal page refcount
- * field at least once, so that the page really is pinned.
- * That's why the refcount from the earlier
- * try_get_compound_head() is left intact.
+ * However, be sure to *also* increment the normal folio
+ * refcount field at least once, so that the folio really
+ * is pinned. That's why the refcount from the earlier
+ * try_get_folio() is left intact.
*/
- if (hpage_pincount_available(page))
- hpage_pincount_add(page, refs);
+ if (folio_test_large(folio))
+ atomic_add(refs, folio_pincount_ptr(folio));
else
- page_ref_add(page, refs * (GUP_PIN_COUNTING_BIAS - 1));
-
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
- refs);
+ folio_ref_add(folio,
+ refs * (GUP_PIN_COUNTING_BIAS - 1));
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
- return page;
+ return folio;
}
WARN_ON_ONCE(1);
return NULL;
}
-static void put_compound_head(struct page *page, int refs, unsigned int flags)
+static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
if (flags & FOLL_PIN) {
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
- refs);
-
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, refs);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
+ if (folio_test_large(folio))
+ atomic_sub(refs, folio_pincount_ptr(folio));
else
refs *= GUP_PIN_COUNTING_BIAS;
}
- put_page_refs(page, refs);
+ folio_put_refs(folio, refs);
}
/**
* try_grab_page() - elevate a page's refcount by a flag-dependent amount
+ * @page: pointer to page to be grabbed
+ * @flags: gup flags: these are the FOLL_* flag values.
*
* This might not do anything at all, depending on the flags argument.
*
* "grab" names in this file mean, "look at flags to decide whether to use
* FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
*
- * @page: pointer to page to be grabbed
- * @flags: gup flags: these are the FOLL_* flag values.
- *
* Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
- * time. Cases: please see the try_grab_compound_head() documentation, with
+ * time. Cases: please see the try_grab_folio() documentation, with
* "refs=1".
*
* Return: true for success, or if no action was required (if neither FOLL_PIN
@@ -208,10 +166,31 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
*/
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return true;
+ struct folio *folio = page_folio(page);
+
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
+ if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
+ return false;
+
+ if (flags & FOLL_GET)
+ folio_ref_inc(folio);
+ else if (flags & FOLL_PIN) {
+ /*
+ * Similar to try_grab_folio(): be sure to *also*
+ * increment the normal page refcount field at least once,
+ * so that the page really is pinned.
+ */
+ if (folio_test_large(folio)) {
+ folio_ref_add(folio, 1);
+ atomic_add(1, folio_pincount_ptr(folio));
+ } else {
+ folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
+ }
- return try_grab_compound_head(page, 1, flags);
+ node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
+ }
+
+ return true;
}
/**
@@ -225,62 +204,40 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
*/
void unpin_user_page(struct page *page)
{
- put_compound_head(compound_head(page), 1, FOLL_PIN);
+ gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
-static inline void compound_range_next(unsigned long i, unsigned long npages,
- struct page **list, struct page **head,
- unsigned int *ntails)
+static inline struct folio *gup_folio_range_next(struct page *start,
+ unsigned long npages, unsigned long i, unsigned int *ntails)
{
- struct page *next, *page;
+ struct page *next = nth_page(start, i);
+ struct folio *folio = page_folio(next);
unsigned int nr = 1;
- if (i >= npages)
- return;
-
- next = *list + i;
- page = compound_head(next);
- if (PageCompound(page) && compound_order(page) >= 1)
- nr = min_t(unsigned int,
- page + compound_nr(page) - next, npages - i);
+ if (folio_test_large(folio))
+ nr = min_t(unsigned int, npages - i,
+ folio_nr_pages(folio) - folio_page_idx(folio, next));
- *head = page;
*ntails = nr;
+ return folio;
}
-#define for_each_compound_range(__i, __list, __npages, __head, __ntails) \
- for (__i = 0, \
- compound_range_next(__i, __npages, __list, &(__head), &(__ntails)); \
- __i < __npages; __i += __ntails, \
- compound_range_next(__i, __npages, __list, &(__head), &(__ntails)))
-
-static inline void compound_next(unsigned long i, unsigned long npages,
- struct page **list, struct page **head,
- unsigned int *ntails)
+static inline struct folio *gup_folio_next(struct page **list,
+ unsigned long npages, unsigned long i, unsigned int *ntails)
{
- struct page *page;
+ struct folio *folio = page_folio(list[i]);
unsigned int nr;
- if (i >= npages)
- return;
-
- page = compound_head(list[i]);
for (nr = i + 1; nr < npages; nr++) {
- if (compound_head(list[nr]) != page)
+ if (page_folio(list[nr]) != folio)
break;
}
- *head = page;
*ntails = nr - i;
+ return folio;
}
-#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
- for (__i = 0, \
- compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
- __i < __npages; __i += __ntails, \
- compound_next(__i, __npages, __list, &(__head), &(__ntails)))
-
/**
* unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
@@ -306,16 +263,17 @@ static inline void compound_next(unsigned long i, unsigned long npages,
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty)
{
- unsigned long index;
- struct page *head;
- unsigned int ntails;
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
if (!make_dirty) {
unpin_user_pages(pages, npages);
return;
}
- for_each_compound_head(index, pages, npages, head, ntails) {
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
/*
* Checking PageDirty at this point may race with
* clear_page_dirty_for_io(), but that's OK. Two key
@@ -336,9 +294,12 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
* written back, so it gets written back again in the
* next writeback cycle. This is harmless.
*/
- if (!PageDirty(head))
- set_page_dirty_lock(head);
- put_compound_head(head, ntails, FOLL_PIN);
+ if (!folio_test_dirty(folio)) {
+ folio_lock(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ }
+ gup_put_folio(folio, nr, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
@@ -367,14 +328,18 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty)
{
- unsigned long index;
- struct page *head;
- unsigned int ntails;
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
- for_each_compound_range(index, &page, npages, head, ntails) {
- if (make_dirty && !PageDirty(head))
- set_page_dirty_lock(head);
- put_compound_head(head, ntails, FOLL_PIN);
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_range_next(page, npages, i, &nr);
+ if (make_dirty && !folio_test_dirty(folio)) {
+ folio_lock(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ }
+ gup_put_folio(folio, nr, FOLL_PIN);
}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
@@ -390,9 +355,9 @@ EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
*/
void unpin_user_pages(struct page **pages, unsigned long npages)
{
- unsigned long index;
- struct page *head;
- unsigned int ntails;
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
/*
* If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -402,8 +367,10 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
- for_each_compound_head(index, pages, npages, head, ntails)
- put_compound_head(head, ntails, FOLL_PIN);
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
}
EXPORT_SYMBOL(unpin_user_pages);
@@ -439,10 +406,6 @@ static struct page *no_page_table(struct vm_area_struct *vma,
static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
pte_t *pte, unsigned int flags)
{
- /* No page to get reference */
- if (flags & FOLL_GET)
- return -EFAULT;
-
if (flags & FOLL_TOUCH) {
pte_t entry = *pte;
@@ -572,32 +535,6 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /* Do not mlock pte-mapped THP */
- if (PageTransCompound(page))
- goto out;
-
- /*
- * The preliminary mapping check is mainly to avoid the
- * pointless overhead of lock_page on the ZERO_PAGE
- * which might bounce very badly if there is contention.
- *
- * If the page is already locked, we don't need to
- * handle it now - vmscan will handle it later if and
- * when it attempts to reclaim the page.
- */
- if (page->mapping && trylock_page(page)) {
- lru_add_drain(); /* push cached pages to LRU */
- /*
- * Because we lock page here, and migration is
- * blocked by the pte's page reference, and we
- * know the page is still mapped, we don't even
- * need to check for file-cache page truncation.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- }
out:
pte_unmap_unlock(ptep, ptl);
return page;
@@ -920,9 +857,6 @@ static int faultin_page(struct vm_area_struct *vma,
unsigned int fault_flags = 0;
vm_fault_t ret;
- /* mlock all present pages, but do not fault in new pages */
- if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
- return -ENOENT;
if (*flags & FOLL_NOFAULT)
return -EFAULT;
if (*flags & FOLL_WRITE)
@@ -1173,15 +1107,20 @@ retry:
case -ENOMEM:
case -EHWPOISON:
goto out;
- case -ENOENT:
- goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
- * struct page.
+ * struct page. If the caller expects **pages to be
+ * filled in, bail out now, because that can't be done
+ * for this page.
*/
+ if (pages) {
+ ret = PTR_ERR(page);
+ goto out;
+ }
+
goto next_page;
} else if (IS_ERR(page)) {
ret = PTR_ERR(page);
@@ -1472,9 +1411,14 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON_VMA(end > vma->vm_end, vma);
mmap_assert_locked(mm);
- gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+ /*
+ * Rightly or wrongly, the VM_LOCKONFAULT case has never used
+ * faultin_page() to break COW, so it has no work to do here.
+ */
if (vma->vm_flags & VM_LOCKONFAULT)
- gup_flags &= ~FOLL_POPULATE;
+ return nr_pages;
+
+ gup_flags = FOLL_TOUCH;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1541,10 +1485,9 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
* in the page table.
* FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
* a poisoned page.
- * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
* !FOLL_FORCE: Require proper access permissions.
*/
- gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+ gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
if (write)
gup_flags |= FOLL_WRITE;
@@ -1704,11 +1647,11 @@ EXPORT_SYMBOL(fault_in_writeable);
* @uaddr: start of address range
* @size: length of address range
*
- * Faults in an address range using get_user_pages, i.e., without triggering
- * hardware page faults. This is primarily useful when we already know that
- * some or all of the pages in the address range aren't in memory.
+ * Faults in an address range for writing. This is primarily useful when we
+ * already know that some or all of the pages in the address range aren't in
+ * memory.
*
- * Other than fault_in_writeable(), this function is non-destructive.
+ * Unlike fault_in_writeable(), this function is non-destructive.
*
* Note that we don't pin or otherwise hold the pages referenced that we fault
* in. There's no guarantee that they'll stay in memory for any duration of
@@ -1719,46 +1662,27 @@ EXPORT_SYMBOL(fault_in_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)untagged_addr(uaddr);
- unsigned long end, nstart, nend;
+ unsigned long start = (unsigned long)uaddr, end;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
+ bool unlocked = false;
- nstart = start & PAGE_MASK;
+ if (unlikely(size == 0))
+ return 0;
end = PAGE_ALIGN(start + size);
- if (end < nstart)
+ if (end < start)
end = 0;
- for (; nstart != end; nstart = nend) {
- unsigned long nr_pages;
- long ret;
- if (!locked) {
- locked = 1;
- mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- nend = end ? min(end, vma->vm_end) : vma->vm_end;
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- nr_pages = (nend - nstart) / PAGE_SIZE;
- ret = __get_user_pages_locked(mm, nstart, nr_pages,
- NULL, NULL, &locked,
- FOLL_TOUCH | FOLL_WRITE);
- if (ret <= 0)
+ mmap_read_lock(mm);
+ do {
+ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
break;
- nend = nstart + ret * PAGE_SIZE;
- }
- if (locked)
- mmap_read_unlock(mm);
- if (nstart == end)
- return 0;
- return size - min_t(size_t, nstart - start, size);
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ } while (start != end);
+ mmap_read_unlock(mm);
+
+ if (size > (unsigned long)uaddr - start)
+ return size - ((unsigned long)uaddr - start);
+ return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
@@ -1843,72 +1767,80 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
struct page **pages,
unsigned int gup_flags)
{
- unsigned long i;
- unsigned long isolation_error_count = 0;
- bool drain_allow = true;
+ unsigned long isolation_error_count = 0, i;
+ struct folio *prev_folio = NULL;
LIST_HEAD(movable_page_list);
- long ret = 0;
- struct page *prev_head = NULL;
- struct page *head;
- struct migration_target_control mtc = {
- .nid = NUMA_NO_NODE,
- .gfp_mask = GFP_USER | __GFP_NOWARN,
- };
+ bool drain_allow = true;
+ int ret = 0;
for (i = 0; i < nr_pages; i++) {
- head = compound_head(pages[i]);
- if (head == prev_head)
+ struct folio *folio = page_folio(pages[i]);
+
+ if (folio == prev_folio)
+ continue;
+ prev_folio = folio;
+
+ if (folio_is_pinnable(folio))
continue;
- prev_head = head;
+
/*
- * If we get a movable page, since we are going to be pinning
- * these entries, try to move them out if possible.
+ * Try to move out any movable page before pinning the range.
*/
- if (!is_pinnable_page(head)) {
- if (PageHuge(head)) {
- if (!isolate_huge_page(head, &movable_page_list))
- isolation_error_count++;
- } else {
- if (!PageLRU(head) && drain_allow) {
- lru_add_drain_all();
- drain_allow = false;
- }
+ if (folio_test_hugetlb(folio)) {
+ if (!isolate_huge_page(&folio->page,
+ &movable_page_list))
+ isolation_error_count++;
+ continue;
+ }
- if (isolate_lru_page(head)) {
- isolation_error_count++;
- continue;
- }
- list_add_tail(&head->lru, &movable_page_list);
- mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON +
- page_is_file_lru(head),
- thp_nr_pages(head));
- }
+ if (!folio_test_lru(folio) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (folio_isolate_lru(folio)) {
+ isolation_error_count++;
+ continue;
}
+ list_add_tail(&folio->lru, &movable_page_list);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
}
+ if (!list_empty(&movable_page_list) || isolation_error_count)
+ goto unpin_pages;
+
/*
* If list is empty, and no isolation errors, means that all pages are
* in the correct zone.
*/
- if (list_empty(&movable_page_list) && !isolation_error_count)
- return nr_pages;
+ return nr_pages;
+unpin_pages:
if (gup_flags & FOLL_PIN) {
unpin_user_pages(pages, nr_pages);
} else {
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
}
+
if (!list_empty(&movable_page_list)) {
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_NOWARN,
+ };
+
ret = migrate_pages(&movable_page_list, alloc_migration_target,
NULL, (unsigned long)&mtc, MIGRATE_SYNC,
MR_LONGTERM_PIN, NULL);
- if (ret && !list_empty(&movable_page_list))
- putback_movable_pages(&movable_page_list);
+ if (ret > 0) /* number of pages not migrated */
+ ret = -ENOMEM;
}
- return ret > 0 ? -ENOMEM : ret;
+ if (ret && !list_empty(&movable_page_list))
+ putback_movable_pages(&movable_page_list);
+ return ret;
}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
@@ -2117,65 +2049,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/**
- * get_user_pages_locked() - variant of get_user_pages()
- *
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * It is suitable to replace the form:
- *
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages(mm, ..., pages, NULL);
- * mmap_read_unlock(mm);
- *
- * to:
- *
- * int locked = 1;
- * mmap_read_lock(mm);
- * do_something()
- * get_user_pages_locked(mm, ..., pages, &locked);
- * if (locked)
- * mmap_read_unlock(mm);
- *
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
- return -EINVAL;
-
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
/*
* get_user_pages_unlocked() is suitable to replace the form:
*
@@ -2277,7 +2150,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
ptem = ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = ptep_get_lockless(ptep);
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
/*
* Similar to the PMD case below, NUMA hinting must take slow
@@ -2304,22 +2178,20 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = try_grab_compound_head(page, 1, flags);
- if (!head)
+ folio = try_grab_folio(page, 1, flags);
+ if (!folio)
goto pte_unmap;
if (unlikely(page_is_secretmem(page))) {
- put_compound_head(head, 1, flags);
+ gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_compound_head(head, 1, flags);
+ gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
- VM_BUG_ON_PAGE(compound_head(page) != head, page);
-
/*
* We need to make the page accessible if and only if we are
* going to access its content (the FOLL_PIN case). Please
@@ -2329,14 +2201,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (flags & FOLL_PIN) {
ret = arch_make_page_accessible(page);
if (ret) {
- unpin_user_page(page);
+ gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
}
- SetPageReferenced(page);
+ folio_set_referenced(folio);
pages[*nr] = page;
(*nr)++;
-
} while (ptep++, addr += PAGE_SIZE, addr != end);
ret = 1;
@@ -2453,8 +2324,8 @@ static int record_subpages(struct page *page, unsigned long addr,
{
int nr;
- for (nr = 0; addr != end; addr += PAGE_SIZE)
- pages[nr++] = page++;
+ for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
+ pages[nr] = nth_page(page, nr);
return nr;
}
@@ -2472,7 +2343,8 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
struct page **pages, int *nr)
{
unsigned long pte_end;
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
pte_t pte;
int refs;
@@ -2488,21 +2360,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- head = pte_page(pte);
- page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(head, refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2536,7 +2407,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
int refs;
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
@@ -2549,20 +2421,20 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
pages, nr);
}
- page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pmd_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2570,7 +2442,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
int refs;
if (!pud_access_permitted(orig, flags & FOLL_WRITE))
@@ -2583,20 +2456,20 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
pages, nr);
}
- page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pud_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -2605,27 +2478,28 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page;
+ struct page *page;
+ struct folio *folio;
if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
+ page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
refs = record_subpages(page, addr, end, pages + *nr);
- head = try_grab_compound_head(pgd_page(orig), refs, flags);
- if (!head)
+ folio = try_grab_folio(page, refs, flags);
+ if (!folio)
return 0;
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- put_compound_head(head, refs, flags);
+ gup_put_folio(folio, refs, flags);
return 0;
}
*nr += refs;
- SetPageReferenced(head);
+ folio_set_referenced(folio);
return 1;
}
@@ -3118,32 +2992,3 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
-
-/*
- * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
- * Behavior is the same, except that this one sets FOLL_PIN and rejects
- * FOLL_GET.
- */
-long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE(gup_flags & FOLL_GET))
- return -EINVAL;
-
- gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/highmem.c b/mm/highmem.c
index 762679050c9a..0cc0c4da7ed9 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -736,11 +736,11 @@ void *page_address(const struct page *page)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
ret = pam->virtual;
- goto done;
+ break;
}
}
}
-done:
+
spin_unlock_irqrestore(&pas->lock, flags);
return ret;
}
@@ -773,13 +773,12 @@ void set_page_address(struct page *page, void *virtual)
list_for_each_entry(pam, &pas->lh, list) {
if (pam->page == page) {
list_del(&pam->list);
- spin_unlock_irqrestore(&pas->lock, flags);
- goto done;
+ break;
}
}
spin_unlock_irqrestore(&pas->lock, flags);
}
-done:
+
return;
}
diff --git a/mm/hmm.c b/mm/hmm.c
index bd56641c79d4..af71aac3140e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -417,7 +417,6 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start;
pud_t pud;
- int ret = 0;
spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
if (!ptl)
@@ -466,7 +465,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
out_unlock:
spin_unlock(ptl);
- return ret;
+ return 0;
}
#else
#define hmm_vma_walk_pud NULL
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 406a3c28c026..2fe38212e07c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -34,11 +34,15 @@
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
+#include <linux/sched/sysctl.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/thp.h>
+
/*
* By default, transparent hugepage support is disabled in order to avoid
* risking an increased memory footprint for applications that are not
@@ -529,7 +533,7 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-bool is_transparent_hugepage(struct page *page)
+static inline bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
return false;
@@ -538,7 +542,6 @@ bool is_transparent_hugepage(struct page *page)
return is_huge_zero_page(page) ||
page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
}
-EXPORT_SYMBOL_GPL(is_transparent_hugepage);
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
@@ -582,13 +585,10 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
- goto out;
-
ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
if (ret)
return ret;
-out:
+
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -1303,7 +1303,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1319,10 +1318,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
/*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
+ * See do_wp_page(): we can only map the page writable if there are
+ * no additional references. Note that we always drain the LRU
+ * pagevecs immediately after adding a THP.
*/
- if (reuse_swap_page(page)) {
+ if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ goto unlock_fallback;
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1333,6 +1337,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_WRITE;
}
+unlock_fallback:
unlock_page(page);
spin_unlock(vmf->ptl);
fallback:
@@ -1380,39 +1385,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- /*
- * We don't mlock() pte-mapped THPs. This way we can avoid
- * leaking mlocked pages into non-VM_LOCKED VMAs.
- *
- * For anon THP:
- *
- * In most cases the pmd is the only mapping of the page as we
- * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
- * writable private mappings in populate_vma_page_range().
- *
- * The only scenario when we have the page shared here is if we
- * mlocking read-only mapping shared over fork(). We skip
- * mlocking such pages.
- *
- * For file THP:
- *
- * We can expect PageDoubleMap() to be stable under page lock:
- * for file pages we set it in page_add_file_rmap(), which
- * requires page to be locked.
- */
-
- if (PageAnon(page) && compound_mapcount(page) != 1)
- goto skip_mlock;
- if (PageDoubleMap(page) || !page->mapping)
- goto skip_mlock;
- if (!trylock_page(page))
- goto skip_mlock;
- if (page->mapping && !PageDoubleMap(page))
- mlock_vma_page(page);
- unlock_page(page);
- }
-skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1610,7 +1582,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (pmd_present(orig_pmd)) {
page = pmd_page(orig_pmd);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
VM_BUG_ON_PAGE(!PageHead(page), page);
} else if (thp_migration_supported()) {
@@ -1766,17 +1738,28 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
#endif
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd))
- goto unlock;
+ if (prot_numa) {
+ struct page *page;
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (is_huge_zero_pmd(*pmd))
+ goto unlock;
- if (prot_numa && pmd_protnone(*pmd))
- goto unlock;
+ if (pmd_protnone(*pmd))
+ goto unlock;
+ page = pmd_page(*pmd);
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(page_to_nid(page)))
+ goto unlock;
+ }
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
@@ -1995,7 +1978,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
set_page_dirty(page);
if (!PageReferenced(page) && pmd_young(old_pmd))
SetPageReferenced(page);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
}
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
@@ -2055,9 +2038,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
}
- VM_BUG_ON_PAGE(!page_count(page), page);
- page_ref_add(page, HPAGE_PMD_NR - 1);
/*
* Withdraw the table only after we mark the pmd entry invalid.
@@ -2129,6 +2112,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
}
unlock_page_memcg(page);
+
+ /* Above is effectively page_remove_rmap(page, vma, true) */
+ munlock_vma_page(page, vma, true);
}
smp_wmb(); /* make pte visible before pmd */
@@ -2136,19 +2122,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (freeze) {
for (i = 0; i < HPAGE_PMD_NR; i++) {
- page_remove_rmap(page + i, false);
+ page_remove_rmap(page + i, vma, false);
put_page(page + i);
}
}
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address, bool freeze, struct page *page)
+ unsigned long address, bool freeze, struct folio *folio)
{
spinlock_t *ptl;
struct mmu_notifier_range range;
- bool do_unlock_page = false;
- pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2157,54 +2141,22 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
ptl = pmd_lock(vma->vm_mm, pmd);
/*
- * If caller asks to setup a migration entries, we need a page to check
- * pmd against. Otherwise we can end up replacing wrong page.
+ * If caller asks to setup a migration entry, we need a folio to check
+ * pmd against. Otherwise we can end up replacing wrong folio.
*/
- VM_BUG_ON(freeze && !page);
- if (page) {
- VM_WARN_ON_ONCE(!PageLocked(page));
- if (page != pmd_page(*pmd))
+ VM_BUG_ON(freeze && !folio);
+ if (folio) {
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ if (folio != page_folio(pmd_page(*pmd)))
goto out;
}
-repeat:
- if (pmd_trans_huge(*pmd)) {
- if (!page) {
- page = pmd_page(*pmd);
- /*
- * An anonymous page must be locked, to ensure that a
- * concurrent reuse_swap_page() sees stable mapcount;
- * but reuse_swap_page() is not used on shmem or file,
- * and page lock must not be taken when zap_pmd_range()
- * calls __split_huge_pmd() while i_mmap_lock is held.
- */
- if (PageAnon(page)) {
- if (unlikely(!trylock_page(page))) {
- get_page(page);
- _pmd = *pmd;
- spin_unlock(ptl);
- lock_page(page);
- spin_lock(ptl);
- if (unlikely(!pmd_same(*pmd, _pmd))) {
- unlock_page(page);
- put_page(page);
- page = NULL;
- goto repeat;
- }
- put_page(page);
- }
- do_unlock_page = true;
- }
- }
- if (PageMlocked(page))
- clear_page_mlock(page);
- } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
- goto out;
- __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
+ is_pmd_migration_entry(*pmd))
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
+
out:
spin_unlock(ptl);
- if (do_unlock_page)
- unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2222,7 +2174,7 @@ out:
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
- bool freeze, struct page *page)
+ bool freeze, struct folio *folio)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -2243,7 +2195,7 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
- __split_huge_pmd(vma, pmd, address, freeze, page);
+ __split_huge_pmd(vma, pmd, address, freeze, folio);
}
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
@@ -2283,6 +2235,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
+ struct folio *folio = page_folio(page);
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
@@ -2293,26 +2246,27 @@ static void unmap_page(struct page *page)
* pages can simply be left unmapped, then faulted back on demand.
* If that is ever changed (perhaps for mlock), update remap_page().
*/
- if (PageAnon(page))
- try_to_migrate(page, ttu_flags);
+ if (folio_test_anon(folio))
+ try_to_migrate(folio, ttu_flags);
else
- try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
+ try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
-static void remap_page(struct page *page, unsigned int nr)
+static void remap_page(struct folio *folio, unsigned long nr)
{
- int i;
+ int i = 0;
/* If unmap_page() uses try_to_migrate() on file, remove this check */
- if (!PageAnon(page))
+ if (!folio_test_anon(folio))
return;
- if (PageTransHuge(page)) {
- remove_migration_ptes(page, page, true);
- } else {
- for (i = 0; i < nr; i++)
- remove_migration_ptes(page + i, page + i, true);
+ for (;;) {
+ remove_migration_ptes(folio, folio, true);
+ i += folio_nr_pages(folio);
+ if (i >= nr)
+ break;
+ folio = folio_next(folio);
}
}
@@ -2332,8 +2286,11 @@ static void lru_add_page_tail(struct page *head, struct page *tail,
} else {
/* head is still on lru (and we have it frozen) */
VM_WARN_ON(!PageLRU(head));
+ if (PageUnevictable(tail))
+ tail->mlock_count = 0;
+ else
+ list_add_tail(&tail->lru, &head->lru);
SetPageLRU(tail);
- list_add_tail(&tail->lru, &head->lru);
}
}
@@ -2469,7 +2426,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
local_irq_enable();
- remap_page(head, nr);
+ remap_page(folio, nr);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2494,91 +2451,20 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
}
-int total_mapcount(struct page *page)
-{
- int i, compound, nr, ret;
-
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- compound = compound_mapcount(page);
- nr = compound_nr(page);
- if (PageHuge(page))
- return compound;
- ret = compound;
- for (i = 0; i < nr; i++)
- ret += atomic_read(&page[i]._mapcount) + 1;
- /* File pages has compound_mapcount included in _mapcount */
- if (!PageAnon(page))
- return ret - compound * nr;
- if (PageDoubleMap(page))
- ret -= nr;
- return ret;
-}
-
-/*
- * This calculates accurately how many mappings a transparent hugepage
- * has (unlike page_mapcount() which isn't fully accurate). This full
- * accuracy is primarily needed to know if copy-on-write faults can
- * reuse the page and change the mapping to read-write instead of
- * copying them. At the same time this returns the total_mapcount too.
- *
- * The function returns the highest mapcount any one of the subpages
- * has. If the return value is one, even if different processes are
- * mapping different subpages of the transparent hugepage, they can
- * all reuse it, because each process is reusing a different subpage.
- *
- * The total_mapcount is instead counting all virtual mappings of the
- * subpages. If the total_mapcount is equal to "one", it tells the
- * caller all mappings belong to the same "mm" and in turn the
- * anon_vma of the transparent hugepage can become the vma->anon_vma
- * local one as no other process may be mapping any of the subpages.
- *
- * It would be more accurate to replace page_mapcount() with
- * page_trans_huge_mapcount(), however we only use
- * page_trans_huge_mapcount() in the copy-on-write faults where we
- * need full accuracy to avoid breaking page pinning, because
- * page_trans_huge_mapcount() is slower than page_mapcount().
- */
-int page_trans_huge_mapcount(struct page *page)
-{
- int i, ret;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (likely(!PageTransCompound(page)))
- return atomic_read(&page->_mapcount) + 1;
-
- page = compound_head(page);
-
- ret = 0;
- for (i = 0; i < thp_nr_pages(page); i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- ret = max(ret, mapcount);
- }
-
- if (PageDoubleMap(page))
- ret -= 1;
-
- return ret + compound_mapcount(page);
-}
-
/* Racy check whether the huge page can be split */
-bool can_split_huge_page(struct page *page, int *pextra_pins)
+bool can_split_folio(struct folio *folio, int *pextra_pins)
{
int extra_pins;
/* Additional pins from page cache */
- if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
+ if (folio_test_anon(folio))
+ extra_pins = folio_test_swapcache(folio) ?
+ folio_nr_pages(folio) : 0;
else
- extra_pins = thp_nr_pages(page);
+ extra_pins = folio_nr_pages(folio);
if (pextra_pins)
*pextra_pins = extra_pins;
- return total_mapcount(page) == page_count(page) - extra_pins - 1;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
}
/*
@@ -2602,7 +2488,8 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
*/
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct deferred_split *ds_queue = get_deferred_split_queue(head);
XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
@@ -2622,7 +2509,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
- * is similar to page_lock_anon_vma_read except the write lock
+ * is similar to folio_lock_anon_vma_read except the write lock
* is taken to serialise against parallel split or collapse
* operations.
*/
@@ -2669,7 +2556,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* Racy check if we can split the page, before unmap_page() will
* split PMDs
*/
- if (!can_split_huge_page(head, &extra_pins)) {
+ if (!can_split_folio(folio, &extra_pins)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2719,7 +2606,7 @@ fail:
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(head, thp_nr_pages(head));
+ remap_page(folio, folio_nr_pages(folio));
ret = -EBUSY;
}
@@ -2953,7 +2840,6 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
*/
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
struct vm_area_struct *vma = find_vma(mm, addr);
- unsigned int follflags;
struct page *page;
if (!vma || addr < vma->vm_start)
@@ -2966,8 +2852,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
}
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- page = follow_page(vma, addr, follflags);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
if (IS_ERR(page))
continue;
@@ -2978,7 +2863,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto next;
total++;
- if (!can_split_huge_page(compound_head(page), NULL))
+ if (!can_split_folio(page_folio(page), NULL))
goto next;
if (!trylock_page(page))
@@ -3171,8 +3056,9 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
set_pmd_at(mm, address, pvmw->pmd, pmdswp);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
put_page(page);
+ trace_set_migration_pmd(address, pmd_val(pmdswp));
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3197,14 +3083,14 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
page_add_anon_rmap(new, vma, mmun_start, true);
else
- page_add_file_rmap(new, true);
+ page_add_file_rmap(new, vma, true);
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
- if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
- mlock_vma_page(new);
+
+ /* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
+ trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61895cc01d09..b34f50156f7e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,6 +31,7 @@
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
+#include <linux/nospec.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -1320,7 +1321,9 @@ static void __destroy_compound_gigantic_page(struct page *page,
}
set_compound_order(page, 0);
+#ifdef CONFIG_64BIT
page[1].compound_nr = 0;
+#endif
__ClearPageHead(page);
}
@@ -1812,7 +1815,9 @@ out_error:
for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
__ClearPageReserved(p);
set_compound_order(page, 0);
+#ifdef CONFIG_64BIT
page[1].compound_nr = 0;
+#endif
__ClearPageHead(page);
return false;
}
@@ -1854,6 +1859,7 @@ int PageHeadHuge(struct page *page_head)
return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
+EXPORT_SYMBOL_GPL(PageHeadHuge);
/*
* Find and lock address space (mapping) in write mode.
@@ -3498,8 +3504,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
#define HSTATE_ATTR(_name) \
- static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static struct kobject *hugepages_kobj;
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
@@ -4159,10 +4164,10 @@ static int __init hugepages_setup(char *s)
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
return 0;
}
- node = tmp;
- p += count + 1;
- if (node < 0 || node >= nr_online_nodes)
+ if (tmp >= nr_online_nodes)
goto invalid;
+ node = array_index_nospec(tmp, nr_online_nodes);
+ p += count + 1;
/* Parse hugepages */
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
goto invalid;
@@ -4637,7 +4642,6 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
vma->vm_page_prot));
}
entry = pte_mkyoung(entry);
- entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
@@ -4851,14 +4855,13 @@ again:
}
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pte_t *src_pte)
+ unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
- pte_t *dst_pte, pte;
spinlock_t *src_ptl, *dst_ptl;
+ pte_t pte;
- dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
dst_ptl = huge_pte_lock(h, mm, dst_pte);
src_ptl = huge_pte_lockptr(h, mm, src_pte);
@@ -4917,7 +4920,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (!dst_pte)
break;
- move_huge_pte(vma, old_addr, new_addr, src_pte);
+ move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
}
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
@@ -5014,7 +5017,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page, true);
+ page_remove_rmap(page, vma, true);
spin_unlock(ptl);
tlb_remove_page_size(tlb, page, huge_page_size(h));
@@ -5259,7 +5262,7 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- page_remove_rmap(old_page, true);
+ page_remove_rmap(old_page, vma, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
@@ -5342,6 +5345,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
pgoff_t idx,
unsigned int flags,
unsigned long haddr,
+ unsigned long addr,
unsigned long reason)
{
vm_fault_t ret;
@@ -5349,6 +5353,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
struct vm_fault vmf = {
.vma = vma,
.address = haddr,
+ .real_address = addr,
.flags = flags,
/*
@@ -5417,7 +5422,7 @@ retry:
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr,
+ flags, haddr, address,
VM_UFFD_MISSING);
goto out;
}
@@ -5481,7 +5486,7 @@ retry:
unlock_page(page);
put_page(page);
ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr,
+ flags, haddr, address,
VM_UFFD_MINOR);
goto out;
}
@@ -5818,7 +5823,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
goto out;
}
- folio_copy(page_folio(page), page_folio(*pagep));
+ copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
+ pages_per_huge_page(h));
put_page(*pagep);
*pagep = NULL;
}
@@ -6072,7 +6078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pages) {
/*
- * try_grab_compound_head() should always succeed here,
+ * try_grab_folio() should always succeed here,
* because: a) we hold the ptl lock, and b) we've just
* checked that the huge page is present in the page
* tables. If the huge page is present, then the tail
@@ -6081,9 +6087,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* any way. So this page must be available at this
* point, unless the page refcount overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
- refs,
- flags))) {
+ if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
+ flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
@@ -6172,7 +6177,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
+ pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
@@ -6890,9 +6895,9 @@ static int __init cmdline_parse_hugetlb_cma(char *p)
break;
if (s[count] == ':') {
- nid = tmp;
- if (nid < 0 || nid >= MAX_NUMNODES)
+ if (tmp >= MAX_NUMNODES)
break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
s += count + 1;
tmp = memparse(s, &s);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c540c21e26f5..791626983c2e 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -124,9 +124,9 @@
* page of page structs (page 0) associated with the HugeTLB page contains the 4
* page structs necessary to describe the HugeTLB. The only use of the remaining
* pages of page structs (page 1 to page 7) is to point to page->compound_head.
- * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
* will be used for each HugeTLB page. This will allow us to free the remaining
- * 6 pages to the buddy allocator.
+ * 7 pages to the buddy allocator.
*
* Here is how things look after remapping.
*
@@ -134,30 +134,30 @@
* +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
* | | | 0 | -------------> | 0 |
* | | +-----------+ +-----------+
- * | | | 1 | -------------> | 1 |
- * | | +-----------+ +-----------+
- * | | | 2 | ----------------^ ^ ^ ^ ^ ^
- * | | +-----------+ | | | | |
- * | | | 3 | ------------------+ | | | |
- * | | +-----------+ | | | |
- * | | | 4 | --------------------+ | | |
- * | PMD | +-----------+ | | |
- * | level | | 5 | ----------------------+ | |
- * | mapping | +-----------+ | |
- * | | | 6 | ------------------------+ |
- * | | +-----------+ |
- * | | | 7 | --------------------------+
+ * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^
+ * | | +-----------+ | | | | | |
+ * | | | 2 | -----------------+ | | | | |
+ * | | +-----------+ | | | | |
+ * | | | 3 | -------------------+ | | | |
+ * | | +-----------+ | | | |
+ * | | | 4 | ---------------------+ | | |
+ * | PMD | +-----------+ | | |
+ * | level | | 5 | -----------------------+ | |
+ * | mapping | +-----------+ | |
+ * | | | 6 | -------------------------+ |
+ * | | +-----------+ |
+ * | | | 7 | ---------------------------+
* | | +-----------+
* | |
* | |
* | |
* +-----------+
*
- * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
* vmemmap pages and restore the previous mapping relationship.
*
* For the HugeTLB page of the pud level mapping. It is similar to the former.
- * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
*
* Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
* (e.g. aarch64) provides a contiguous bit in the translation table entries
@@ -166,7 +166,13 @@
*
* The contiguous bit is used to increase the mapping size at the pmd and pte
* (last) level. So this type of HugeTLB page can be optimized only when its
- * size of the struct page structs is greater than 2 pages.
+ * size of the struct page structs is greater than 1 page.
+ *
+ * Notice: The head vmemmap page is not freed to the buddy allocator and all
+ * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
+ * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
+ * associated with each HugeTLB page. The compound_head() can handle this
+ * correctly (more details refer to the comment above compound_head()).
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
@@ -175,19 +181,21 @@
/*
* There are a lot of struct page structures associated with each HugeTLB page.
* For tail pages, the value of compound_head is the same. So we can reuse first
- * page of tail page structures. We map the virtual addresses of the remaining
- * pages of tail page structures to the first tail page struct, and then free
- * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ * page of head page structures. We map the virtual addresses of all the pages
+ * of tail page structures to the head page struct, and then free these page
+ * frames. Therefore, we need to reserve one pages as vmemmap areas.
*/
-#define RESERVE_VMEMMAP_NR 2U
+#define RESERVE_VMEMMAP_NR 1U
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
+ hugetlb_free_vmemmap_enabled_key);
+EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
static int __init early_hugetlb_free_vmemmap_param(char *buf)
{
/* We cannot optimize if a "struct page" crosses page boundaries. */
- if ((!is_power_of_2(sizeof(struct page)))) {
+ if (!is_power_of_2(sizeof(struct page))) {
pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
return 0;
}
@@ -196,9 +204,9 @@ static int __init early_hugetlb_free_vmemmap_param(char *buf)
return -EINVAL;
if (!strcmp(buf, "on"))
- hugetlb_free_vmemmap_enabled = true;
+ static_branch_enable(&hugetlb_free_vmemmap_enabled_key);
else if (!strcmp(buf, "off"))
- hugetlb_free_vmemmap_enabled = false;
+ static_branch_disable(&hugetlb_free_vmemmap_enabled_key);
else
return -EINVAL;
@@ -236,7 +244,6 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
*/
ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
-
if (!ret)
ClearHPageVmemmapOptimized(head);
@@ -277,14 +284,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
BUILD_BUG_ON(__NR_USED_SUBPAGE >=
RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_free_vmemmap_enabled)
+ if (!hugetlb_free_vmemmap_enabled())
return;
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
/*
- * The head page and the first tail page are not to be freed to buddy
- * allocator, the other pages will map to the first tail page, so they
- * can be freed.
+ * The head page is not to be freed to buddy allocator, the other tail
+ * pages will map to the head page, so they can be freed.
*
* Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
* on some architectures (e.g. aarch64). See Documentation/arm64/
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index aff4d27ec235..bb0cea5468cb 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -32,9 +32,9 @@ static int hwpoison_inject(void *data, u64 val)
shake_page(hpage);
/*
- * This implies unable to support non-LRU pages.
+ * This implies unable to support non-LRU pages except free page.
*/
- if (!PageLRU(hpage) && !PageHuge(p))
+ if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
return 0;
/*
@@ -48,7 +48,8 @@ static int hwpoison_inject(void *data, u64 val)
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, 0);
+ err = memory_failure(pfn, 0);
+ return (err == -EOPNOTSUPP) ? 0 : err;
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/init-mm.c b/mm/init-mm.c
index b4a6f38fb51d..fbe7844d0912 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/user_namespace.h>
+#include <linux/ioasid.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -38,6 +39,9 @@ struct mm_struct init_mm = {
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
+#ifdef CONFIG_IOMMU_SVA
+ .pasid = INVALID_IOASID,
+#endif
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index d80300392a19..58dc6adc19c5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/rmap.h>
#include <linux/tracepoint-defs.h>
struct folio_batch;
@@ -66,24 +67,20 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat)
vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
+void deactivate_file_folio(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
-static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
-{
- return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
-}
-
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
- unsigned long lookahead_size);
+void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
+ unsigned int order);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
static inline void force_page_cache_readahead(struct address_space *mapping,
struct file *file, pgoff_t index, unsigned long nr_to_read)
@@ -100,6 +97,9 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
+long invalidate_inode_page(struct page *page);
+unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec);
/**
* folio_evictable - Test whether a folio is evictable.
@@ -155,10 +155,18 @@ extern unsigned long highest_memmap_pfn;
#define MAX_RECLAIM_RETRIES 16
/*
+ * in mm/early_ioremap.c
+ */
+pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
+ unsigned long size, pgprot_t prot);
+
+/*
* in mm/vmscan.c:
*/
-extern int isolate_lru_page(struct page *page);
-extern void putback_lru_page(struct page *page);
+int isolate_lru_page(struct page *page);
+int folio_isolate_lru(struct folio *folio);
+void putback_lru_page(struct page *page);
+void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
/*
@@ -390,6 +398,7 @@ static inline bool is_data_mapping(vm_flags_t flags)
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev);
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
+struct anon_vma *folio_anon_vma(struct folio *folio);
#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
@@ -398,32 +407,56 @@ extern long populate_vma_page_range(struct vm_area_struct *vma,
extern long faultin_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
bool write, int *locked);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end);
-static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
-{
- munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
-}
-
-/*
- * must be called with vma's mmap_lock held for read or write, and page locked.
- */
-extern void mlock_vma_page(struct page *page);
-extern unsigned int munlock_vma_page(struct page *page);
-
extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
unsigned long len);
-
/*
- * Clear the page's PageMlocked(). This can be useful in a situation where
- * we want to unconditionally remove a page from the pagecache -- e.g.,
- * on truncation or freeing.
+ * mlock_vma_page() and munlock_vma_page():
+ * should be called with vma's mmap_lock held for read or write,
+ * under page table lock for the pte/pmd being added or removed.
*
- * It is legal to call this function for any page, mlocked or not.
- * If called for a page that is still mapped by mlocked vmas, all we do
- * is revert to lazy LRU behaviour -- semantics are not broken.
+ * mlock is usually called at the end of page_add_*_rmap(),
+ * munlock at the end of page_remove_rmap(); but new anon
+ * pages are managed by lru_cache_add_inactive_or_unevictable()
+ * calling mlock_new_page().
+ *
+ * @compound is used to include pmd mappings of THPs, but filter out
+ * pte mappings of THPs, which cannot be consistently counted: a pte
+ * mapping of the THP head cannot be distinguished by the page alone.
*/
-extern void clear_page_mlock(struct page *page);
+void mlock_folio(struct folio *folio);
+static inline void mlock_vma_folio(struct folio *folio,
+ struct vm_area_struct *vma, bool compound)
+{
+ /*
+ * The VM_SPECIAL check here serves two purposes.
+ * 1) VM_IO check prevents migration from double-counting during mlock.
+ * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
+ * is never left set on a VM_SPECIAL vma, there is an interval while
+ * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
+ * still be set while VM_SPECIAL bits are added: so ignore it then.
+ */
+ if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) &&
+ (compound || !folio_test_large(folio)))
+ mlock_folio(folio);
+}
+
+static inline void mlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound)
+{
+ mlock_vma_folio(page_folio(page), vma, compound);
+}
+
+void munlock_page(struct page *page);
+static inline void munlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound)
+{
+ if (unlikely(vma->vm_flags & VM_LOCKED) &&
+ (compound || !PageTransCompound(page)))
+ munlock_page(page);
+}
+void mlock_new_page(struct page *page);
+bool need_mlock_page_drain(int cpu);
+void mlock_page_drain(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
@@ -457,18 +490,20 @@ vma_address(struct page *page, struct vm_area_struct *vma)
}
/*
- * Then at what user virtual address will none of the page be found in vma?
+ * Then at what user virtual address will none of the range be found in vma?
* Assumes that vma_address() already returned a good starting address.
- * If page is a compound head, the entire compound page is considered.
*/
-static inline unsigned long
-vma_address_end(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
+ struct vm_area_struct *vma = pvmw->vma;
pgoff_t pgoff;
unsigned long address;
- VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
- pgoff = page_to_pgoff(page) + compound_nr(page);
+ /* Common case, plus ->pgoff is invalid for KSM */
+ if (pvmw->nr_pages == 1)
+ return pvmw->address + PAGE_SIZE;
+
+ pgoff = pvmw->pgoff + pvmw->nr_pages;
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address > vma->vm_end)
@@ -498,8 +533,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
-static inline void clear_page_mlock(struct page *page) { }
-static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound) { }
+static inline void munlock_vma_page(struct page *page,
+ struct vm_area_struct *vma, bool compound) { }
+static inline void mlock_new_page(struct page *page) { }
+static inline bool need_mlock_page_drain(int cpu) { return false; }
+static inline void mlock_page_drain(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
@@ -572,17 +612,6 @@ static inline void mminit_verify_zonelist(void)
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */
-/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
-#if defined(CONFIG_SPARSEMEM)
-extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn);
-#else
-static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
- unsigned long *end_pfn)
-{
-}
-#endif /* CONFIG_SPARSEMEM */
-
#define NODE_RECLAIM_NOSCAN -2
#define NODE_RECLAIM_FULL -1
#define NODE_RECLAIM_SOME 0
@@ -718,4 +747,13 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
+void free_zone_device_page(struct page *page);
+
+/*
+ * mm/gup.c
+ */
+struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
+
+DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index adcd9acaef61..1f84df9c302e 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,7 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-obj-$(CONFIG_KASAN) := common.o report.o
+obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 92196562687b..d9079ec11f31 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -387,7 +387,7 @@ static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
}
/*
- * The object will be poisoned by kasan_free_pages() or
+ * The object will be poisoned by kasan_poison_pages() or
* kasan_slab_free_mempool().
*/
@@ -538,7 +538,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
return NULL;
/*
- * The object has already been unpoisoned by kasan_alloc_pages() for
+ * The object has already been unpoisoned by kasan_unpoison_pages() for
* alloc_pages() or by kasan_krealloc() for krealloc().
*/
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 7355cb534e4f..07a76c46daa5 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -32,6 +32,12 @@ enum kasan_arg_mode {
KASAN_ARG_MODE_ASYMM,
};
+enum kasan_arg_vmalloc {
+ KASAN_ARG_VMALLOC_DEFAULT,
+ KASAN_ARG_VMALLOC_OFF,
+ KASAN_ARG_VMALLOC_ON,
+};
+
enum kasan_arg_stacktrace {
KASAN_ARG_STACKTRACE_DEFAULT,
KASAN_ARG_STACKTRACE_OFF,
@@ -40,18 +46,28 @@ enum kasan_arg_stacktrace {
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
-/* Whether KASAN is enabled at all. */
+/*
+ * Whether KASAN is enabled at all.
+ * The value remains false until KASAN is initialized by kasan_init_hw_tags().
+ */
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
-/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
+/*
+ * Whether the selected mode is synchronous, asynchronous, or asymmetric.
+ * Defaults to KASAN_MODE_SYNC.
+ */
enum kasan_mode kasan_mode __ro_after_init;
EXPORT_SYMBOL_GPL(kasan_mode);
+/* Whether to enable vmalloc tagging. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+
/* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
@@ -89,6 +105,23 @@ static int __init early_kasan_mode(char *arg)
}
early_param("kasan.mode", early_kasan_mode);
+/* kasan.vmalloc=off/on */
+static int __init early_kasan_flag_vmalloc(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_vmalloc = KASAN_ARG_VMALLOC_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
+
/* kasan.stacktrace=off/on */
static int __init early_kasan_flag_stacktrace(char *arg)
{
@@ -116,7 +149,10 @@ static inline const char *kasan_mode_info(void)
return "sync";
}
-/* kasan_init_hw_tags_cpu() is called for each CPU. */
+/*
+ * kasan_init_hw_tags_cpu() is called for each CPU.
+ * Not marked as __init as a CPU can be hot-plugged after boot.
+ */
void kasan_init_hw_tags_cpu(void)
{
/*
@@ -124,7 +160,11 @@ void kasan_init_hw_tags_cpu(void)
* as this function is only called for MTE-capable hardware.
*/
- /* If KASAN is disabled via command line, don't initialize it. */
+ /*
+ * If KASAN is disabled via command line, don't initialize it.
+ * When this function is called, kasan_flag_enabled is not yet
+ * set by kasan_init_hw_tags(). Thus, check kasan_arg instead.
+ */
if (kasan_arg == KASAN_ARG_OFF)
return;
@@ -132,12 +172,7 @@ void kasan_init_hw_tags_cpu(void)
* Enable async or asymm modes only when explicitly requested
* through the command line.
*/
- if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
- hw_enable_tagging_async();
- else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
- hw_enable_tagging_asymm();
- else
- hw_enable_tagging_sync();
+ kasan_enable_tagging();
}
/* kasan_init_hw_tags() is called once on boot CPU. */
@@ -151,86 +186,168 @@ void __init kasan_init_hw_tags(void)
if (kasan_arg == KASAN_ARG_OFF)
return;
- /* Enable KASAN. */
- static_branch_enable(&kasan_flag_enabled);
-
switch (kasan_arg_mode) {
case KASAN_ARG_MODE_DEFAULT:
- /*
- * Default to sync mode.
- */
- fallthrough;
+ /* Default is specified by kasan_mode definition. */
+ break;
case KASAN_ARG_MODE_SYNC:
- /* Sync mode enabled. */
kasan_mode = KASAN_MODE_SYNC;
break;
case KASAN_ARG_MODE_ASYNC:
- /* Async mode enabled. */
kasan_mode = KASAN_MODE_ASYNC;
break;
case KASAN_ARG_MODE_ASYMM:
- /* Asymm mode enabled. */
kasan_mode = KASAN_MODE_ASYMM;
break;
}
+ switch (kasan_arg_vmalloc) {
+ case KASAN_ARG_VMALLOC_DEFAULT:
+ /* Default is specified by kasan_flag_vmalloc definition. */
+ break;
+ case KASAN_ARG_VMALLOC_OFF:
+ static_branch_disable(&kasan_flag_vmalloc);
+ break;
+ case KASAN_ARG_VMALLOC_ON:
+ static_branch_enable(&kasan_flag_vmalloc);
+ break;
+ }
+
switch (kasan_arg_stacktrace) {
case KASAN_ARG_STACKTRACE_DEFAULT:
- /* Default to enabling stack trace collection. */
- static_branch_enable(&kasan_flag_stacktrace);
+ /* Default is specified by kasan_flag_stacktrace definition. */
break;
case KASAN_ARG_STACKTRACE_OFF:
- /* Do nothing, kasan_flag_stacktrace keeps its default value. */
+ static_branch_disable(&kasan_flag_stacktrace);
break;
case KASAN_ARG_STACKTRACE_ON:
static_branch_enable(&kasan_flag_stacktrace);
break;
}
- pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
+ /* KASAN is now initialized, enable it. */
+ static_branch_enable(&kasan_flag_enabled);
+
+ pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n",
kasan_mode_info(),
+ kasan_vmalloc_enabled() ? "on" : "off",
kasan_stack_collection_enabled() ? "on" : "off");
}
-void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+#ifdef CONFIG_KASAN_VMALLOC
+
+static void unpoison_vmalloc_pages(const void *addr, u8 tag)
{
+ struct vm_struct *area;
+ int i;
+
/*
- * This condition should match the one in post_alloc_hook() in
- * page_alloc.c.
+ * As hardware tag-based KASAN only tags VM_ALLOC vmalloc allocations
+ * (see the comment in __kasan_unpoison_vmalloc), all of the pages
+ * should belong to a single area.
*/
- bool init = !want_init_on_free() && want_init_on_alloc(flags);
-
- if (flags & __GFP_SKIP_KASAN_POISON)
- SetPageSkipKASanPoison(page);
+ area = find_vm_area((void *)addr);
+ if (WARN_ON(!area))
+ return;
- if (flags & __GFP_ZEROTAGS) {
- int i;
+ for (i = 0; i < area->nr_pages; i++) {
+ struct page *page = area->pages[i];
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
- } else {
- kasan_unpoison_pages(page, order, init);
+ page_kasan_tag_set(page, tag);
}
}
-void kasan_free_pages(struct page *page, unsigned int order)
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
{
+ u8 tag;
+ unsigned long redzone_start, redzone_size;
+
+ if (!kasan_vmalloc_enabled())
+ return (void *)start;
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
+ * mappings as:
+ *
+ * 1. Unlike the software KASAN modes, hardware tag-based KASAN only
+ * supports tagging physical memory. Therefore, it can only tag a
+ * single mapping of normal physical pages.
+ * 2. Hardware tag-based KASAN can only tag memory mapped with special
+ * mapping protection bits, see arch_vmalloc_pgprot_modify().
+ * As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
+ * providing these bits would require tracking all non-VM_ALLOC
+ * mappers.
+ *
+ * Thus, for VM_ALLOC mappings, hardware tag-based KASAN only tags
+ * the first virtual mapping, which is created by vmalloc().
+ * Tagging the page_alloc memory backing that vmalloc() allocation is
+ * skipped, see ___GFP_SKIP_KASAN_UNPOISON.
+ *
+ * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
+ */
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ tag = kasan_random_tag();
+ start = set_tag(start, tag);
+
+ /* Unpoison and initialize memory up to size. */
+ kasan_unpoison(start, size, flags & KASAN_VMALLOC_INIT);
+
+ /*
+ * Explicitly poison and initialize the in-page vmalloc() redzone.
+ * Unlike software KASAN modes, hardware tag-based KASAN doesn't
+ * unpoison memory when populating shadow for vmalloc() space.
+ */
+ redzone_start = round_up((unsigned long)start + size,
+ KASAN_GRANULE_SIZE);
+ redzone_size = round_up(redzone_start, PAGE_SIZE) - redzone_start;
+ kasan_poison((void *)redzone_start, redzone_size, KASAN_TAG_INVALID,
+ flags & KASAN_VMALLOC_INIT);
+
/*
- * This condition should match the one in free_pages_prepare() in
- * page_alloc.c.
+ * Set per-page tag flags to allow accessing physical memory for the
+ * vmalloc() mapping through page_address(vmalloc_to_page()).
*/
- bool init = want_init_on_free();
+ unpoison_vmalloc_pages(start, tag);
- kasan_poison_pages(page, order, init);
+ return (void *)start;
+}
+
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ /*
+ * No tagging here.
+ * The physical pages backing the vmalloc() allocation are poisoned
+ * through the usual page_alloc paths.
+ */
}
+#endif
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_enable_tagging_sync(void)
+void kasan_enable_tagging(void)
{
- hw_enable_tagging_sync();
+ if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
+ hw_enable_tagging_async();
+ else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
+ hw_enable_tagging_asymm();
+ else
+ hw_enable_tagging_sync();
}
-EXPORT_SYMBOL_GPL(kasan_enable_tagging_sync);
+EXPORT_SYMBOL_GPL(kasan_enable_tagging);
void kasan_force_async_fault(void)
{
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c17fa8d26ffe..d79b83d673b1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -12,7 +12,8 @@
#include <linux/static_key.h>
#include "../slab.h"
-DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
enum kasan_mode {
KASAN_MODE_SYNC,
@@ -22,6 +23,11 @@ enum kasan_mode {
extern enum kasan_mode kasan_mode __ro_after_init;
+static inline bool kasan_vmalloc_enabled(void)
+{
+ return static_branch_likely(&kasan_flag_vmalloc);
+}
+
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
@@ -71,17 +77,19 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */
#endif
+#ifdef CONFIG_KASAN_GENERIC
+
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -110,6 +118,8 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_ABI_VERSION 1
#endif
+#endif /* CONFIG_KASAN_GENERIC */
+
/* Metadata layout customization. */
#define META_BYTES_PER_BLOCK 1
#define META_BLOCKS_PER_ROW 16
@@ -117,9 +127,15 @@ static inline bool kasan_sync_fault_possible(void)
#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
#define META_ROWS_AROUND_ADDR 2
-struct kasan_access_info {
- const void *access_addr;
- const void *first_bad_addr;
+enum kasan_report_type {
+ KASAN_REPORT_ACCESS,
+ KASAN_REPORT_INVALID_FREE,
+};
+
+struct kasan_report_info {
+ enum kasan_report_type type;
+ void *access_addr;
+ void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
@@ -204,6 +220,14 @@ struct kasan_free_meta {
#endif
};
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+/* Used in KUnit-compatible KASAN tests. */
+struct kunit_kasan_status {
+ bool report_found;
+ bool sync_fault;
+};
+#endif
+
struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
const void *object);
#ifdef CONFIG_KASAN_GENERIC
@@ -221,7 +245,8 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
static inline bool addr_has_metadata(const void *addr)
{
- return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+ return (kasan_reset_tag(addr) >=
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
}
/**
@@ -251,10 +276,10 @@ static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
void *kasan_find_first_bad_addr(void *addr, size_t size);
-const char *kasan_get_bug_type(struct kasan_access_info *info);
+const char *kasan_get_bug_type(struct kasan_report_info *info);
void kasan_metadata_fetch_row(char *buffer, void *row);
-#if defined(CONFIG_KASAN_GENERIC) && defined(CONFIG_KASAN_STACK)
+#if defined(CONFIG_KASAN_STACK)
void kasan_print_address_stack_frame(const void *addr);
#else
static inline void kasan_print_address_stack_frame(const void *addr) { }
@@ -340,12 +365,12 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
-void kasan_enable_tagging_sync(void);
+void kasan_enable_tagging(void);
void kasan_force_async_fault(void);
#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
-static inline void kasan_enable_tagging_sync(void) { }
+static inline void kasan_enable_tagging(void) { }
static inline void kasan_force_async_fault(void) { }
#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */
@@ -467,6 +492,13 @@ static inline bool kasan_arch_is_ready(void) { return true; }
#error kasan_arch_is_ready only works in KASAN generic outline mode!
#endif
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
+bool kasan_save_enable_multi_shot(void);
+void kasan_restore_multi_shot(bool enabled);
+
+#endif
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3ad9624dcc56..199d77cce21a 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
@@ -64,6 +65,40 @@ static int __init early_kasan_fault(char *arg)
}
early_param("kasan.fault", early_kasan_fault);
+static int __init kasan_set_multi_shot(char *str)
+{
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
+}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
+
+/*
+ * Used to suppress reports within kasan_disable/enable_current() critical
+ * sections, which are used for marking accesses to slab metadata.
+ */
+static bool report_suppressed(void)
+{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+ if (current->kasan_depth)
+ return true;
+#endif
+ return false;
+}
+
+/*
+ * Used to avoid reporting more than one KASAN bug unless kasan_multi_shot
+ * is enabled. Note that KASAN tests effectively enable kasan_multi_shot
+ * for their duration.
+ */
+static bool report_enabled(void)
+{
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST)
+
bool kasan_save_enable_multi_shot(void)
{
return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
@@ -77,61 +112,87 @@ void kasan_restore_multi_shot(bool enabled)
}
EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-static int __init kasan_set_multi_shot(char *str)
-{
- set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
- return 1;
-}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
+#endif
-static void print_error_description(struct kasan_access_info *info)
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+static void update_kunit_status(bool sync)
{
- pr_err("BUG: KASAN: %s in %pS\n",
- kasan_get_bug_type(info), (void *)info->ip);
- if (info->access_size)
- pr_err("%s of size %zu at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read", info->access_size,
- info->access_addr, current->comm, task_pid_nr(current));
- else
- pr_err("%s at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_addr, current->comm, task_pid_nr(current));
+ struct kunit *test;
+ struct kunit_resource *resource;
+ struct kunit_kasan_status *status;
+
+ test = current->kunit_test;
+ if (!test)
+ return;
+
+ resource = kunit_find_named_resource(test, "kasan_status");
+ if (!resource) {
+ kunit_set_failure(test);
+ return;
+ }
+
+ status = (struct kunit_kasan_status *)resource->data;
+ WRITE_ONCE(status->report_found, true);
+ WRITE_ONCE(status->sync_fault, sync);
+
+ kunit_put_resource(resource);
}
+#else
+static void update_kunit_status(bool sync) { }
+#endif
static DEFINE_SPINLOCK(report_lock);
-static void start_report(unsigned long *flags)
+static void start_report(unsigned long *flags, bool sync)
{
- /*
- * Make sure we don't end up in loop.
- */
+ /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */
+ disable_trace_on_warning();
+ /* Update status of the currently running KASAN test. */
+ update_kunit_status(sync);
+ /* Do not allow LOCKDEP mangling KASAN reports. */
+ lockdep_off();
+ /* Make sure we don't end up in loop. */
kasan_disable_current();
spin_lock_irqsave(&report_lock, *flags);
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags, unsigned long addr)
+static void end_report(unsigned long *flags, void *addr)
{
- if (!kasan_async_fault_possible())
- trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
+ if (addr)
+ trace_error_report_end(ERROR_DETECTOR_KASAN,
+ (unsigned long)addr);
pr_err("==================================================================\n");
- add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
panic("panic_on_warn set ...\n");
- }
if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC)
panic("kasan.fault=panic set ...\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ lockdep_on();
kasan_enable_current();
}
+static void print_error_description(struct kasan_report_info *info)
+{
+ if (info->type == KASAN_REPORT_INVALID_FREE) {
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
+ (void *)info->ip);
+ return;
+ }
+
+ pr_err("BUG: KASAN: %s in %pS\n",
+ kasan_get_bug_type(info), (void *)info->ip);
+ if (info->access_size)
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+ else
+ pr_err("%s at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_addr, current->comm, task_pid_nr(current));
+}
+
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
@@ -170,9 +231,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
" which belongs to the cache %s of size %d\n",
object, cache->name, cache->object_size);
- if (!addr)
- return;
-
if (access_addr < object_addr) {
rel_type = "to the left";
rel_bytes = object_addr - access_addr;
@@ -261,19 +319,43 @@ static void print_address_description(void *addr, u8 tag)
void *object = nearest_obj(cache, slab, addr);
describe_object(cache, object, addr, tag);
+ pr_err("\n");
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
pr_err("The buggy address belongs to the variable:\n");
pr_err(" %pS\n", addr);
+ pr_err("\n");
+ }
+
+ if (object_is_on_stack(addr)) {
+ /*
+ * Currently, KASAN supports printing frame information only
+ * for accesses to the task's own stack.
+ */
+ kasan_print_address_stack_frame(addr);
+ pr_err("\n");
+ }
+
+ if (is_vmalloc_addr(addr)) {
+ struct vm_struct *va = find_vm_area(addr);
+
+ if (va) {
+ pr_err("The buggy address belongs to the virtual mapping at\n"
+ " [%px, %px) created by:\n"
+ " %pS\n",
+ va->addr, va->addr + va->size, va->caller);
+ pr_err("\n");
+
+ page = vmalloc_to_page(page);
+ }
}
if (page) {
- pr_err("The buggy address belongs to the page:\n");
+ pr_err("The buggy address belongs to the physical page:\n");
dump_page(page, "kasan: bad access detected");
+ pr_err("\n");
}
-
- kasan_print_address_stack_frame(addr);
}
static bool meta_row_is_guilty(const void *row, const void *addr)
@@ -332,138 +414,110 @@ static void print_memory_metadata(const void *addr)
}
}
-static bool report_enabled(void)
+static void print_report(struct kasan_report_info *info)
{
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
- if (current->kasan_depth)
- return false;
-#endif
- if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
- return true;
- return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
-}
+ void *tagged_addr = info->access_addr;
+ void *untagged_addr = kasan_reset_tag(tagged_addr);
+ u8 tag = get_tag(tagged_addr);
-#if IS_ENABLED(CONFIG_KUNIT)
-static void kasan_update_kunit_status(struct kunit *cur_test)
-{
- struct kunit_resource *resource;
- struct kunit_kasan_expectation *kasan_data;
-
- resource = kunit_find_named_resource(cur_test, "kasan_data");
+ print_error_description(info);
+ if (addr_has_metadata(untagged_addr))
+ kasan_print_tags(tag, info->first_bad_addr);
+ pr_err("\n");
- if (!resource) {
- kunit_set_failure(cur_test);
- return;
+ if (addr_has_metadata(untagged_addr)) {
+ print_address_description(untagged_addr, tag);
+ print_memory_metadata(info->first_bad_addr);
+ } else {
+ dump_stack_lvl(KERN_ERR);
}
-
- kasan_data = (struct kunit_kasan_expectation *)resource->data;
- WRITE_ONCE(kasan_data->report_found, true);
- kunit_put_resource(resource);
}
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-void kasan_report_invalid_free(void *object, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip)
{
unsigned long flags;
- u8 tag = get_tag(object);
-
- object = kasan_reset_tag(object);
+ struct kasan_report_info info;
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ /*
+ * Do not check report_suppressed(), as an invalid-free cannot be
+ * caused by accessing slab metadata and thus should not be
+ * suppressed by kasan_disable/enable_current() critical sections.
+ */
+ if (unlikely(!report_enabled()))
+ return;
- start_report(&flags);
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- kasan_print_tags(tag, object);
- pr_err("\n");
- print_address_description(object, tag);
- pr_err("\n");
- print_memory_metadata(object);
- end_report(&flags, (unsigned long)object);
-}
+ start_report(&flags, true);
-#ifdef CONFIG_KASAN_HW_TAGS
-void kasan_report_async(void)
-{
- unsigned long flags;
+ info.type = KASAN_REPORT_INVALID_FREE;
+ info.access_addr = ptr;
+ info.first_bad_addr = kasan_reset_tag(ptr);
+ info.access_size = 0;
+ info.is_write = false;
+ info.ip = ip;
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
+ print_report(&info);
- start_report(&flags);
- pr_err("BUG: KASAN: invalid-access\n");
- pr_err("Asynchronous mode enabled: no access details available\n");
- pr_err("\n");
- dump_stack_lvl(KERN_ERR);
- end_report(&flags, 0);
+ end_report(&flags, ptr);
}
-#endif /* CONFIG_KASAN_HW_TAGS */
-static void __kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
+/*
+ * kasan_report() is the only reporting function that uses
+ * user_access_save/restore(): kasan_report_invalid_free() cannot be called
+ * from a UACCESS region, and kasan_report_async() is not used on x86.
+ */
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
{
- struct kasan_access_info info;
- void *tagged_addr;
- void *untagged_addr;
- unsigned long flags;
-
-#if IS_ENABLED(CONFIG_KUNIT)
- if (current->kunit_test)
- kasan_update_kunit_status(current->kunit_test);
-#endif /* IS_ENABLED(CONFIG_KUNIT) */
-
- disable_trace_on_warning();
+ bool ret = true;
+ void *ptr = (void *)addr;
+ unsigned long ua_flags = user_access_save();
+ unsigned long irq_flags;
+ struct kasan_report_info info;
+
+ if (unlikely(report_suppressed()) || unlikely(!report_enabled())) {
+ ret = false;
+ goto out;
+ }
- tagged_addr = (void *)addr;
- untagged_addr = kasan_reset_tag(tagged_addr);
+ start_report(&irq_flags, true);
- info.access_addr = tagged_addr;
- if (addr_has_metadata(untagged_addr))
- info.first_bad_addr =
- kasan_find_first_bad_addr(tagged_addr, size);
- else
- info.first_bad_addr = untagged_addr;
+ info.type = KASAN_REPORT_ACCESS;
+ info.access_addr = ptr;
+ info.first_bad_addr = kasan_find_first_bad_addr(ptr, size);
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
- start_report(&flags);
+ print_report(&info);
- print_error_description(&info);
- if (addr_has_metadata(untagged_addr))
- kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr);
- pr_err("\n");
+ end_report(&irq_flags, ptr);
- if (addr_has_metadata(untagged_addr)) {
- print_address_description(untagged_addr, get_tag(tagged_addr));
- pr_err("\n");
- print_memory_metadata(info.first_bad_addr);
- } else {
- dump_stack_lvl(KERN_ERR);
- }
+out:
+ user_access_restore(ua_flags);
- end_report(&flags, addr);
+ return ret;
}
-bool kasan_report(unsigned long addr, size_t size, bool is_write,
- unsigned long ip)
+#ifdef CONFIG_KASAN_HW_TAGS
+void kasan_report_async(void)
{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
+ unsigned long flags;
- user_access_restore(flags);
+ /*
+ * Do not check report_suppressed(), as kasan_disable/enable_current()
+ * critical sections do not affect Hardware Tag-Based KASAN.
+ */
+ if (unlikely(!report_enabled()))
+ return;
- return ret;
+ start_report(&flags, false);
+ pr_err("BUG: KASAN: invalid-access\n");
+ pr_err("Asynchronous fault: no details available\n");
+ pr_err("\n");
+ dump_stack_lvl(KERN_ERR);
+ end_report(&flags, NULL);
}
+#endif /* CONFIG_KASAN_HW_TAGS */
#ifdef CONFIG_KASAN_INLINE
/*
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 139615ef326b..efc5e79a103f 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -34,12 +34,16 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
{
void *p = addr;
+ if (!addr_has_metadata(p))
+ return p;
+
while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
p += KASAN_GRANULE_SIZE;
+
return p;
}
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
+static const char *get_shadow_bug_type(struct kasan_report_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -91,7 +95,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
return bug_type;
}
-static const char *get_wild_bug_type(struct kasan_access_info *info)
+static const char *get_wild_bug_type(struct kasan_report_info *info)
{
const char *bug_type = "unknown-crash";
@@ -105,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -180,7 +184,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
return;
pr_err("\n");
- pr_err("this frame has %lu %s:\n", num_objects,
+ pr_err("This frame has %lu %s:\n", num_objects,
num_objects == 1 ? "object" : "objects");
while (num_objects--) {
@@ -211,6 +215,7 @@ static void print_decoded_frame_descr(const char *frame_descr)
}
}
+/* Returns true only if the address is on the current task's stack. */
static bool __must_check get_address_stack_frame_info(const void *addr,
unsigned long *offset,
const char **frame_descr,
@@ -224,13 +229,6 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
- /*
- * NOTE: We currently only support printing frame information for
- * accesses to the task's own stack.
- */
- if (!object_is_on_stack(addr))
- return false;
-
aligned_addr = round_down((unsigned long)addr, sizeof(long));
mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
@@ -269,17 +267,17 @@ void kasan_print_address_stack_frame(const void *addr)
const char *frame_descr;
const void *frame_pc;
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+
if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
&frame_pc))
return;
- /*
- * get_address_stack_frame_info only returns true if the given addr is
- * on the current task's stack.
- */
- pr_err("\n");
- pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
- addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" and is located at offset %lu in frame:\n", offset);
pr_err(" %pS\n", frame_pc);
if (!frame_descr)
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 5dbbbb930e7a..f3d3be614e4b 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -17,6 +17,7 @@
void *kasan_find_first_bad_addr(void *addr, size_t size)
{
+ /* Return the same value regardless of whether addr_has_metadata(). */
return kasan_reset_tag(addr);
}
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index d2298c357834..7a26397297ed 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
@@ -35,8 +36,12 @@ void *kasan_find_first_bad_addr(void *addr, size_t size)
void *p = kasan_reset_tag(addr);
void *end = p + size;
+ if (!addr_has_metadata(p))
+ return p;
+
while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
p += KASAN_GRANULE_SIZE;
+
return p;
}
@@ -51,3 +56,14 @@ void kasan_print_tags(u8 addr_tag, const void *addr)
pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
}
+
+#ifdef CONFIG_KASAN_STACK
+void kasan_print_address_stack_frame(const void *addr)
+{
+ if (WARN_ON(!object_is_on_stack(addr)))
+ return;
+
+ pr_err("The buggy address belongs to stack of task %s/%d\n",
+ current->comm, task_pid_nr(current));
+}
+#endif
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index 1b41de88c53e..e25d2166e813 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -7,7 +7,7 @@
#include "kasan.h"
#include "../slab.h"
-const char *kasan_get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_report_info *info)
{
#ifdef CONFIG_KASAN_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 94136f84b449..a4f07de21771 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -345,27 +345,6 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
return 0;
}
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_GRANULE_SIZE);
- kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- kasan_unpoison(start, size, false);
-}
-
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
@@ -496,9 +475,48 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
}
}
+void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
+ kasan_vmalloc_flags_t flags)
+{
+ /*
+ * Software KASAN modes unpoison both VM_ALLOC and non-VM_ALLOC
+ * mappings, so the KASAN_VMALLOC_VM_ALLOC flag is ignored.
+ * Software KASAN modes can't optimize zeroing memory by combining it
+ * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored.
+ */
+
+ if (!is_vmalloc_or_module_addr(start))
+ return (void *)start;
+
+ /*
+ * Don't tag executable memory with the tag-based mode.
+ * The kernel doesn't tolerate having the PC register tagged.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS) &&
+ !(flags & KASAN_VMALLOC_PROT_NORMAL))
+ return (void *)start;
+
+ start = set_tag(start, kasan_random_tag());
+ kasan_unpoison(start, size, false);
+ return (void *)start;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void __kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID, false);
+}
+
#else /* CONFIG_KASAN_VMALLOC */
-int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
+int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
{
void *ret;
size_t scaled_size;
@@ -534,7 +552,7 @@ int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
return -ENOMEM;
}
-void kasan_free_shadow(const struct vm_struct *vm)
+void kasan_free_module_shadow(const struct vm_struct *vm)
{
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
index 6872cd5e5390..0bb95728a784 100644
--- a/mm/kfence/Makefile
+++ b/mm/kfence/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_KFENCE) := core.o report.o
+obj-y := core.o report.o
CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5ad40e3add45..2f9fdfde1941 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -38,22 +38,27 @@
#define KFENCE_WARN_ON(cond) \
({ \
const bool __cond = WARN_ON(cond); \
- if (unlikely(__cond)) \
+ if (unlikely(__cond)) { \
WRITE_ONCE(kfence_enabled, false); \
+ disabled_by_warn = true; \
+ } \
__cond; \
})
/* === Data ================================================================= */
static bool kfence_enabled __read_mostly;
+static bool disabled_by_warn __read_mostly;
-static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "kfence."
+static int kfence_enable_late(void);
static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
{
unsigned long num;
@@ -64,10 +69,11 @@ static int param_set_sample_interval(const char *val, const struct kernel_param
if (!num) /* Using 0 to indicate KFENCE is disabled. */
WRITE_ONCE(kfence_enabled, false);
- else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
- return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */
*((unsigned long *)kp->arg) = num;
+
+ if (num && !READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
+ return disabled_by_warn ? -EINVAL : kfence_enable_late();
return 0;
}
@@ -89,8 +95,12 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+/* If true, use a deferrable timer. */
+static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
+module_param_named(deferrable, kfence_deferrable, bool, 0444);
+
/* The pool of pages used for guard pages and objects. */
-char *__kfence_pool __ro_after_init;
+char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
/*
@@ -531,17 +541,19 @@ static void rcu_guarded_free(struct rcu_head *h)
kfence_guarded_free((void *)meta->addr, meta, false);
}
-static bool __init kfence_init_pool(void)
+/*
+ * Initialization of the KFENCE pool after its allocation.
+ * Returns 0 on success; otherwise returns the address up to
+ * which partial initialization succeeded.
+ */
+static unsigned long kfence_init_pool(void)
{
unsigned long addr = (unsigned long)__kfence_pool;
struct page *pages;
int i;
- if (!__kfence_pool)
- return false;
-
if (!arch_kfence_init_pool())
- goto err;
+ return addr;
pages = virt_to_page(addr);
@@ -559,7 +571,7 @@ static bool __init kfence_init_pool(void)
/* Verify we do not have a compound head page. */
if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
- goto err;
+ return addr;
__SetPageSlab(&pages[i]);
}
@@ -572,7 +584,7 @@ static bool __init kfence_init_pool(void)
*/
for (i = 0; i < 2; i++) {
if (unlikely(!kfence_protect(addr)))
- goto err;
+ return addr;
addr += PAGE_SIZE;
}
@@ -589,7 +601,7 @@ static bool __init kfence_init_pool(void)
/* Protect the right redzone. */
if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
- goto err;
+ return addr;
addr += 2 * PAGE_SIZE;
}
@@ -602,9 +614,21 @@ static bool __init kfence_init_pool(void)
*/
kmemleak_free(__kfence_pool);
- return true;
+ return 0;
+}
+
+static bool __init kfence_init_pool_early(void)
+{
+ unsigned long addr;
+
+ if (!__kfence_pool)
+ return false;
+
+ addr = kfence_init_pool();
+
+ if (!addr)
+ return true;
-err:
/*
* Only release unprotected pages, and do not try to go back and change
* page attributes due to risk of failing to do so as well. If changing
@@ -617,6 +641,26 @@ err:
return false;
}
+static bool kfence_init_pool_late(void)
+{
+ unsigned long addr, free_size;
+
+ addr = kfence_init_pool();
+
+ if (!addr)
+ return true;
+
+ /* Same as above. */
+ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
+#ifdef CONFIG_CONTIG_ALLOC
+ free_contig_range(page_to_pfn(virt_to_page(addr)), free_size / PAGE_SIZE);
+#else
+ free_pages_exact((void *)addr, free_size);
+#endif
+ __kfence_pool = NULL;
+ return false;
+}
+
/* === DebugFS Interface ==================================================== */
static int stats_show(struct seq_file *seq, void *v)
@@ -700,6 +744,8 @@ late_initcall(kfence_debugfs_init);
/* === Allocation Gate Timer ================================================ */
+static struct delayed_work kfence_timer;
+
#ifdef CONFIG_KFENCE_STATIC_KEYS
/* Wait queue to wake up allocation-gate timer task. */
static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
@@ -722,7 +768,6 @@ static DEFINE_IRQ_WORK(wake_up_kfence_timer_work, wake_up_kfence_timer);
* avoids IPIs, at the cost of not immediately capturing allocations if the
* instructions remain cached.
*/
-static struct delayed_work kfence_timer;
static void toggle_allocation_gate(struct work_struct *work)
{
if (!READ_ONCE(kfence_enabled))
@@ -750,7 +795,6 @@ static void toggle_allocation_gate(struct work_struct *work)
queue_delayed_work(system_unbound_wq, &kfence_timer,
msecs_to_jiffies(kfence_sample_interval));
}
-static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
/* === Public interface ===================================================== */
@@ -765,25 +809,77 @@ void __init kfence_alloc_pool(void)
pr_err("failed to allocate pool\n");
}
+static void kfence_init_enable(void)
+{
+ if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
+ static_branch_enable(&kfence_allocation_key);
+
+ if (kfence_deferrable)
+ INIT_DEFERRABLE_WORK(&kfence_timer, toggle_allocation_gate);
+ else
+ INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
+
+ WRITE_ONCE(kfence_enabled, true);
+ queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+
+ pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
+ CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
+ (void *)(__kfence_pool + KFENCE_POOL_SIZE));
+}
+
void __init kfence_init(void)
{
+ stack_hash_seed = (u32)random_get_entropy();
+
/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
if (!kfence_sample_interval)
return;
- stack_hash_seed = (u32)random_get_entropy();
- if (!kfence_init_pool()) {
+ if (!kfence_init_pool_early()) {
pr_err("%s failed\n", __func__);
return;
}
- if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
- static_branch_enable(&kfence_allocation_key);
+ kfence_init_enable();
+}
+
+static int kfence_init_late(void)
+{
+ const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE;
+#ifdef CONFIG_CONTIG_ALLOC
+ struct page *pages;
+
+ pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL);
+ if (!pages)
+ return -ENOMEM;
+ __kfence_pool = page_to_virt(pages);
+#else
+ if (nr_pages > MAX_ORDER_NR_PAGES) {
+ pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n");
+ return -EINVAL;
+ }
+ __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL);
+ if (!__kfence_pool)
+ return -ENOMEM;
+#endif
+
+ if (!kfence_init_pool_late()) {
+ pr_err("%s failed\n", __func__);
+ return -EBUSY;
+ }
+
+ kfence_init_enable();
+ return 0;
+}
+
+static int kfence_enable_late(void)
+{
+ if (!__kfence_pool)
+ return kfence_init_late();
+
WRITE_ONCE(kfence_enabled, true);
queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
- pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
- CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
- (void *)(__kfence_pool + KFENCE_POOL_SIZE));
+ return 0;
}
void kfence_shutdown_cache(struct kmem_cache *s)
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index a22b1af85577..1b50f70a4c0f 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
/*
* Especially for non-preemption kernels, ensure the allocation-gate
* timer can catch up: after @resched_after, every failed allocation
* attempt yields, to ensure the allocation-gate timer is scheduled.
*/
- resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+ resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
do {
if (test_cache)
alloc = kmem_cache_alloc(test_cache, gfp);
@@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test)
int i;
/* Skip if we think it'd take too long. */
- KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
+ KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
@@ -623,10 +623,11 @@ static void test_gfpzero(struct kunit *test)
break;
test_free(buf2);
- if (i == CONFIG_KFENCE_NUM_OBJECTS) {
+ if (kthread_should_stop() || (i == CONFIG_KFENCE_NUM_OBJECTS)) {
kunit_warn(test, "giving up ... cannot get same object back\n");
return;
}
+ cond_resched();
}
for (i = 0; i < size; i++)
@@ -739,7 +740,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
do {
void *objects[100];
int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 35f14d0a00a6..a4e5eaf3eb01 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -16,6 +16,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@@ -45,7 +46,6 @@ enum scan_result {
SCAN_VMA_NULL,
SCAN_VMA_CHECK,
SCAN_ADDRESS_RANGE,
- SCAN_SWAP_CACHE_PAGE,
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
@@ -682,16 +682,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PAGE_COUNT;
goto out;
}
- if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page)) {
- /*
- * Page is in the swap cache and cannot be re-used.
- * It cannot be collapsed into a THP.
- */
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
/*
* Isolate the page to avoid collapsing an hugepage
@@ -773,7 +763,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
*/
spin_lock(ptl);
ptep_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page, false);
+ page_remove_rmap(src_page, vma, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -1416,6 +1406,21 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
return 0;
}
+static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ mmap_assert_write_locked(mm);
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ pmd = pmdp_collapse_flush(vma, addr, pmdp);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pmd);
+ pte_free(mm, pmd_pgtable(pmd));
+}
+
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@@ -1433,7 +1438,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma = find_vma(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd, _pmd;
+ pmd_t *pmd;
spinlock_t *ptl;
int count = 0;
int i;
@@ -1497,7 +1502,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
}
pte_unmap_unlock(start_pte, ptl);
@@ -1509,12 +1514,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
}
/* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
-
+ collapse_and_free_pmd(mm, vma, haddr, pmd);
drop_hpage:
unlock_page(hpage);
put_page(hpage);
@@ -1552,7 +1552,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
- pmd_t *pmd, _pmd;
+ pmd_t *pmd;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1591,14 +1591,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* reverse order. Trylock is a way to avoid deadlock.
*/
if (mmap_write_trylock(mm)) {
- if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
- }
+ if (!khugepaged_test_exit(mm))
+ collapse_and_free_pmd(mm, vma, addr, pmd);
mmap_write_unlock(mm);
} else {
/* Try again later */
@@ -1829,13 +1823,13 @@ static void collapse_file(struct mm_struct *mm,
}
if (page_mapped(page))
- unmap_mapping_pages(mapping, index, 1, false);
+ try_to_unmap(page_folio(page),
+ TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
xas_lock_irq(&xas);
xas_set(&xas, index);
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
- VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* The page is expected to have page_count() == 3:
@@ -1899,6 +1893,13 @@ xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
+ /*
+ * If collapse is successful, flush must be done now before copying.
+ * If collapse is unsuccessful, does flush actually need to be done?
+ * Do it anyway, to clear the state.
+ */
+ try_to_unmap_flush();
+
if (result == SCAN_SUCCEED) {
struct page *page, *tmp;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index dc3758fdba68..7580baa76af1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1410,7 +1410,8 @@ static void kmemleak_scan(void)
{
unsigned long flags;
struct kmemleak_object *object;
- int i;
+ struct zone *zone;
+ int __maybe_unused i;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1450,9 +1451,9 @@ static void kmemleak_scan(void)
* Struct page scanning for each node.
*/
get_online_mems();
- for_each_online_node(i) {
- unsigned long start_pfn = node_start_pfn(i);
- unsigned long end_pfn = node_end_pfn(i);
+ for_each_populated_zone(zone) {
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -1461,8 +1462,8 @@ static void kmemleak_scan(void)
if (!page)
continue;
- /* only scan pages belonging to this node */
- if (page_to_nid(page) != i)
+ /* only scan pages belonging to this zone */
+ if (page_zone(page) != zone)
continue;
/* only scan if page is in use */
if (page_count(page) == 0)
diff --git a/mm/ksm.c b/mm/ksm.c
index c20bd4d9a0d9..063a48eeb5ee 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1034,10 +1034,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- };
+ DEFINE_PAGE_VMA_WALK(pvmw, page, vma, 0, 0);
int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
@@ -1177,7 +1174,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -1252,16 +1249,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
err = replace_page(vma, page, kpage, orig_pte);
}
- if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
- munlock_vma_page(page);
- if (!PageMlocked(kpage)) {
- unlock_page(page);
- lock_page(kpage);
- mlock_vma_page(kpage);
- page = kpage; /* for final unlock */
- }
- }
-
out_unlock:
unlock_page(page);
out:
@@ -2567,7 +2554,8 @@ void __ksm_exit(struct mm_struct *mm)
struct page *ksm_might_need_to_copy(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- struct anon_vma *anon_vma = page_anon_vma(page);
+ struct folio *folio = page_folio(page);
+ struct anon_vma *anon_vma = folio_anon_vma(folio);
struct page *new_page;
if (PageKsm(page)) {
@@ -2595,26 +2583,29 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
__SetPageLocked(new_page);
+#ifdef CONFIG_SWAP
+ count_vm_event(KSM_SWPIN_COPY);
+#endif
}
return new_page;
}
-void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
int search_new_forks = 0;
- VM_BUG_ON_PAGE(!PageKsm(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
/*
* Rely on the page lock to protect against concurrent modifications
* to that page's node of the stable tree.
*/
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- stable_node = page_stable_node(page);
+ stable_node = folio_stable_node(folio);
if (!stable_node)
return;
again:
@@ -2649,11 +2640,11 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
+ if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
return;
}
- if (rwc->done && rwc->done(page)) {
+ if (rwc->done && rwc->done(folio)) {
anon_vma_unlock_read(anon_vma);
return;
}
@@ -2826,8 +2817,7 @@ static void wait_while_offlining(void)
#define KSM_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
- static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static ssize_t sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0cd5e89ca063..c669d87001a6 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -13,6 +13,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
+#include "internal.h"
#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(memcg_list_lrus);
@@ -49,35 +50,32 @@ static int lru_shrinker_id(struct list_lru *lru)
}
static inline struct list_lru_one *
-list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
- struct list_lru_memcg *memcg_lrus;
- /*
- * Either lock or RCU protects the array of per cgroup lists
- * from relocation (see memcg_update_list_lru_node).
- */
- memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
- lockdep_is_held(&nlru->lock));
- if (memcg_lrus && idx >= 0)
- return memcg_lrus->lru[idx];
- return &nlru->lru;
+ if (list_lru_memcg_aware(lru) && idx >= 0) {
+ struct list_lru_memcg *mlru = xa_load(&lru->xa, idx);
+
+ return mlru ? &mlru->node[nid] : NULL;
+ }
+ return &lru->node[nid].lru;
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
struct mem_cgroup **memcg_ptr)
{
+ struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l = &nlru->lru;
struct mem_cgroup *memcg = NULL;
- if (!nlru->memcg_lrus)
+ if (!list_lru_memcg_aware(lru))
goto out;
memcg = mem_cgroup_from_obj(ptr);
if (!memcg)
goto out;
- l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
out:
if (memcg_ptr)
*memcg_ptr = memcg;
@@ -103,18 +101,18 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
}
static inline struct list_lru_one *
-list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
- return &nlru->lru;
+ return &lru->node[nid].lru;
}
static inline struct list_lru_one *
-list_lru_from_kmem(struct list_lru_node *nlru, void *ptr,
+list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
struct mem_cgroup **memcg_ptr)
{
if (memcg_ptr)
*memcg_ptr = NULL;
- return &nlru->lru;
+ return &lru->node[nid].lru;
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -127,7 +125,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (list_empty(item)) {
- l = list_lru_from_kmem(nlru, item, &memcg);
+ l = list_lru_from_kmem(lru, nid, item, &memcg);
list_add_tail(item, &l->list);
/* Set shrinker bit if the first element was added */
if (!l->nr_items++)
@@ -150,7 +148,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
spin_lock(&nlru->lock);
if (!list_empty(item)) {
- l = list_lru_from_kmem(nlru, item, NULL);
+ l = list_lru_from_kmem(lru, nid, item, NULL);
list_del_init(item);
l->nr_items--;
nlru->nr_items--;
@@ -180,13 +178,12 @@ EXPORT_SYMBOL_GPL(list_lru_isolate_move);
unsigned long list_lru_count_one(struct list_lru *lru,
int nid, struct mem_cgroup *memcg)
{
- struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
long count;
rcu_read_lock();
- l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
- count = READ_ONCE(l->nr_items);
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ count = l ? READ_ONCE(l->nr_items) : 0;
rcu_read_unlock();
if (unlikely(count < 0))
@@ -206,17 +203,20 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid)
EXPORT_SYMBOL_GPL(list_lru_count_node);
static unsigned long
-__list_lru_walk_one(struct list_lru_node *nlru, int memcg_idx,
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk)
{
-
+ struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
- l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
+ l = list_lru_from_memcg_idx(lru, nid, memcg_idx);
+ if (!l)
+ goto out;
+
list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
@@ -260,6 +260,7 @@ restart:
BUG();
}
}
+out:
return isolated;
}
@@ -272,8 +273,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
unsigned long ret;
spin_lock(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
- nr_to_walk);
+ ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
+ cb_arg, nr_to_walk);
spin_unlock(&nlru->lock);
return ret;
}
@@ -288,8 +289,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
unsigned long ret;
spin_lock_irq(&nlru->lock);
- ret = __list_lru_walk_one(nlru, memcg_cache_id(memcg), isolate, cb_arg,
- nr_to_walk);
+ ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate,
+ cb_arg, nr_to_walk);
spin_unlock_irq(&nlru->lock);
return ret;
}
@@ -299,16 +300,20 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
unsigned long *nr_to_walk)
{
long isolated = 0;
- int memcg_idx;
isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
nr_to_walk);
+
+#ifdef CONFIG_MEMCG_KMEM
if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
- for_each_memcg_cache_index(memcg_idx) {
+ struct list_lru_memcg *mlru;
+ unsigned long index;
+
+ xa_for_each(&lru->xa, index, mlru) {
struct list_lru_node *nlru = &lru->node[nid];
spin_lock(&nlru->lock);
- isolated += __list_lru_walk_one(nlru, memcg_idx,
+ isolated += __list_lru_walk_one(lru, nid, index,
isolate, cb_arg,
nr_to_walk);
spin_unlock(&nlru->lock);
@@ -317,6 +322,8 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
break;
}
}
+#endif
+
return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
@@ -328,204 +335,81 @@ static void init_one_lru(struct list_lru_one *l)
}
#ifdef CONFIG_MEMCG_KMEM
-static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
- int begin, int end)
+static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
{
- int i;
-
- for (i = begin; i < end; i++)
- kfree(memcg_lrus->lru[i]);
-}
-
-static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
- int begin, int end)
-{
- int i;
+ int nid;
+ struct list_lru_memcg *mlru;
- for (i = begin; i < end; i++) {
- struct list_lru_one *l;
+ mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
+ if (!mlru)
+ return NULL;
- l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
- if (!l)
- goto fail;
+ for_each_node(nid)
+ init_one_lru(&mlru->node[nid]);
- init_one_lru(l);
- memcg_lrus->lru[i] = l;
- }
- return 0;
-fail:
- __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
- return -ENOMEM;
+ return mlru;
}
-static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
{
- struct list_lru_memcg *memcg_lrus;
- int size = memcg_nr_cache_ids;
-
- memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL);
- if (!memcg_lrus)
- return -ENOMEM;
-
- if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
- kvfree(memcg_lrus);
- return -ENOMEM;
- }
- RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
-
- return 0;
-}
+ struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
-static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
-{
- struct list_lru_memcg *memcg_lrus;
/*
- * This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu().
+ * The __list_lru_walk_one() can walk the list of this node.
+ * We need kvfree_rcu() here. And the walking of the list
+ * is under lru->node[nid]->lock, which can serve as a RCU
+ * read-side critical section.
*/
- memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
- __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
- kvfree(memcg_lrus);
+ if (mlru)
+ kvfree_rcu(mlru, rcu);
}
-static int memcg_update_list_lru_node(struct list_lru_node *nlru,
- int old_size, int new_size)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- struct list_lru_memcg *old, *new;
-
- BUG_ON(old_size > new_size);
-
- old = rcu_dereference_protected(nlru->memcg_lrus,
- lockdep_is_held(&list_lrus_mutex));
- new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- if (__memcg_init_list_lru_node(new, old_size, new_size)) {
- kvfree(new);
- return -ENOMEM;
- }
-
- memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size));
- rcu_assign_pointer(nlru->memcg_lrus, new);
- kvfree_rcu(old, rcu);
- return 0;
-}
-
-static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
- int old_size, int new_size)
-{
- struct list_lru_memcg *memcg_lrus;
-
- memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
- lockdep_is_held(&list_lrus_mutex));
- /* do not bother shrinking the array back to the old size, because we
- * cannot handle allocation failures here */
- __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
-}
-
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
-{
- int i;
-
+ if (memcg_aware)
+ xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
lru->memcg_aware = memcg_aware;
-
- if (!memcg_aware)
- return 0;
-
- for_each_node(i) {
- if (memcg_init_list_lru_node(&lru->node[i]))
- goto fail;
- }
- return 0;
-fail:
- for (i = i - 1; i >= 0; i--) {
- if (!lru->node[i].memcg_lrus)
- continue;
- memcg_destroy_list_lru_node(&lru->node[i]);
- }
- return -ENOMEM;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
{
- int i;
+ XA_STATE(xas, &lru->xa, 0);
+ struct list_lru_memcg *mlru;
if (!list_lru_memcg_aware(lru))
return;
- for_each_node(i)
- memcg_destroy_list_lru_node(&lru->node[i]);
-}
-
-static int memcg_update_list_lru(struct list_lru *lru,
- int old_size, int new_size)
-{
- int i;
-
- for_each_node(i) {
- if (memcg_update_list_lru_node(&lru->node[i],
- old_size, new_size))
- goto fail;
- }
- return 0;
-fail:
- for (i = i - 1; i >= 0; i--) {
- if (!lru->node[i].memcg_lrus)
- continue;
-
- memcg_cancel_update_list_lru_node(&lru->node[i],
- old_size, new_size);
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, mlru, ULONG_MAX) {
+ kfree(mlru);
+ xas_store(&xas, NULL);
}
- return -ENOMEM;
-}
-
-static void memcg_cancel_update_list_lru(struct list_lru *lru,
- int old_size, int new_size)
-{
- int i;
-
- for_each_node(i)
- memcg_cancel_update_list_lru_node(&lru->node[i],
- old_size, new_size);
+ xas_unlock_irq(&xas);
}
-int memcg_update_all_list_lrus(int new_size)
-{
- int ret = 0;
- struct list_lru *lru;
- int old_size = memcg_nr_cache_ids;
-
- mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &memcg_list_lrus, list) {
- ret = memcg_update_list_lru(lru, old_size, new_size);
- if (ret)
- goto fail;
- }
-out:
- mutex_unlock(&list_lrus_mutex);
- return ret;
-fail:
- list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
- memcg_cancel_update_list_lru(lru, old_size, new_size);
- goto out;
-}
-
-static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
- int src_idx, struct mem_cgroup *dst_memcg)
+static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
/*
+ * If there is no lru entry in this nlru, we can skip it immediately.
+ */
+ if (!READ_ONCE(nlru->nr_items))
+ return;
+
+ /*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
- src = list_lru_from_memcg_idx(nlru, src_idx);
- dst = list_lru_from_memcg_idx(nlru, dst_idx);
+ src = list_lru_from_memcg_idx(lru, nid, src_idx);
+ if (!src)
+ goto out;
+ dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
list_splice_init(&src->list, &dst->list);
@@ -534,32 +418,143 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
src->nr_items = 0;
}
-
+out:
spin_unlock_irq(&nlru->lock);
}
-static void memcg_drain_list_lru(struct list_lru *lru,
- int src_idx, struct mem_cgroup *dst_memcg)
+static void memcg_reparent_list_lru(struct list_lru *lru,
+ int src_idx, struct mem_cgroup *dst_memcg)
{
int i;
for_each_node(i)
- memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
+ memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg);
+
+ memcg_list_lru_free(lru, src_idx);
}
-void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
+void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
+ struct cgroup_subsys_state *css;
struct list_lru *lru;
+ int src_idx = memcg->kmemcg_id;
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent.
+ *
+ * After we have finished, all list_lrus corresponding to this cgroup
+ * are guaranteed to remain empty. So we can safely free this cgroup's
+ * list lrus in memcg_list_lru_free().
+ *
+ * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc()
+ * from allocating list lrus for this cgroup after memcg_list_lru_free()
+ * call.
+ */
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &memcg->css) {
+ struct mem_cgroup *child;
+
+ child = mem_cgroup_from_css(css);
+ WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
+ }
+ rcu_read_unlock();
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &memcg_list_lrus, list)
- memcg_drain_list_lru(lru, src_idx, dst_memcg);
+ memcg_reparent_list_lru(lru, src_idx, parent);
mutex_unlock(&list_lrus_mutex);
}
+
+static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
+ struct list_lru *lru)
+{
+ int idx = memcg->kmemcg_id;
+
+ return idx < 0 || xa_load(&lru->xa, idx);
+}
+
+int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
+ gfp_t gfp)
+{
+ int i;
+ unsigned long flags;
+ struct list_lru_memcg_table {
+ struct list_lru_memcg *mlru;
+ struct mem_cgroup *memcg;
+ } *table;
+ XA_STATE(xas, &lru->xa, 0);
+
+ if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
+ return 0;
+
+ gfp &= GFP_RECLAIM_MASK;
+ table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
+ if (!table)
+ return -ENOMEM;
+
+ /*
+ * Because the list_lru can be reparented to the parent cgroup's
+ * list_lru, we should make sure that this cgroup and all its
+ * ancestors have allocated list_lru_memcg.
+ */
+ for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
+ if (memcg_list_lru_allocated(memcg, lru))
+ break;
+
+ table[i].memcg = memcg;
+ table[i].mlru = memcg_init_list_lru_one(gfp);
+ if (!table[i].mlru) {
+ while (i--)
+ kfree(table[i].mlru);
+ kfree(table);
+ return -ENOMEM;
+ }
+ }
+
+ xas_lock_irqsave(&xas, flags);
+ while (i--) {
+ int index = READ_ONCE(table[i].memcg->kmemcg_id);
+ struct list_lru_memcg *mlru = table[i].mlru;
+
+ xas_set(&xas, index);
+retry:
+ if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
+ kfree(mlru);
+ } else {
+ xas_store(&xas, mlru);
+ if (xas_error(&xas) == -ENOMEM) {
+ xas_unlock_irqrestore(&xas, flags);
+ if (xas_nomem(&xas, gfp))
+ xas_set_err(&xas, 0);
+ xas_lock_irqsave(&xas, flags);
+ /*
+ * The xas lock has been released, this memcg
+ * can be reparented before us. So reload
+ * memcg id. More details see the comments
+ * in memcg_reparent_list_lrus().
+ */
+ index = READ_ONCE(table[i].memcg->kmemcg_id);
+ if (index < 0)
+ xas_set_err(&xas, 0);
+ else if (!xas_error(&xas) && index != xas.xa_index)
+ xas_set(&xas, index);
+ goto retry;
+ }
+ }
+ }
+ /* xas_nomem() is used to free memory instead of memory allocation. */
+ if (xas.xa_alloc)
+ xas_nomem(&xas, gfp);
+ xas_unlock_irqrestore(&xas, flags);
+ kfree(table);
+
+ return xas_error(&xas);
+}
#else
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
- return 0;
}
static void memcg_destroy_list_lru(struct list_lru *lru)
@@ -571,7 +566,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
if (shrinker)
@@ -579,11 +573,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
else
lru->shrinker_id = -1;
#endif
- memcg_get_cache_ids();
lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
- goto out;
+ return -ENOMEM;
for_each_node(i) {
spin_lock_init(&lru->node[i].lock);
@@ -592,18 +585,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware,
init_one_lru(&lru->node[i].lru);
}
- err = memcg_init_list_lru(lru, memcg_aware);
- if (err) {
- kfree(lru->node);
- /* Do this so a list_lru_destroy() doesn't crash: */
- lru->node = NULL;
- goto out;
- }
-
+ memcg_init_list_lru(lru, memcg_aware);
list_lru_register(lru);
-out:
- memcg_put_cache_ids();
- return err;
+
+ return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);
@@ -613,8 +598,6 @@ void list_lru_destroy(struct list_lru *lru)
if (!lru->node)
return;
- memcg_get_cache_ids();
-
list_lru_unregister(lru);
memcg_destroy_list_lru(lru);
@@ -624,6 +607,5 @@ void list_lru_destroy(struct list_lru *lru)
#ifdef CONFIG_MEMCG_KMEM
lru->shrinker_id = -1;
#endif
- memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/maccess.c b/mm/maccess.c
index d3f1a1f0b1c1..5f4d240f67ec 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -12,8 +12,6 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
return true;
}
-#ifdef HAVE_GET_KERNEL_NOFAULT
-
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
while (len >= sizeof(type)) { \
__get_kernel_nofault(dst, src, type, err_label); \
@@ -102,112 +100,6 @@ Efault:
dst[-1] = '\0';
return -EFAULT;
}
-#else /* HAVE_GET_KERNEL_NOFAULT */
-/**
- * copy_from_kernel_nofault(): safely attempt to read from kernel-space
- * @dst: pointer to the buffer that shall take the data
- * @src: address to read from
- * @size: size of the data chunk
- *
- * Safely read from kernel address @src to the buffer at @dst. If a kernel
- * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
- * address, return -ERANGE.
- *
- * We ensure that the copy_from_user is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_lock. This makes
- * copy_from_kernel_nofault() suitable for use within regions where the caller
- * already holds mmap_lock, or other locks which nest inside mmap_lock.
- */
-long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- if (!copy_from_kernel_nofault_allowed(src, size))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
- size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
-
-/**
- * copy_to_kernel_nofault(): safely attempt to write to a location
- * @dst: address to write to
- * @src: pointer to the data that shall be written
- * @size: size of the data chunk
- *
- * Safely write to address @dst from the buffer at @src. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
-{
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
- set_fs(old_fs);
-
- if (ret)
- return -EFAULT;
- return 0;
-}
-
-/**
- * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
- * address.
- * @dst: Destination address, in kernel space. This buffer must be at
- * least @count bytes long.
- * @unsafe_addr: Unsafe address.
- * @count: Maximum number of bytes to copy, including the trailing NUL.
- *
- * Copies a NUL-terminated string from unsafe address to kernel buffer.
- *
- * On success, returns the length of the string INCLUDING the trailing NUL.
- *
- * If access fails, returns -EFAULT (some data may have been copied and the
- * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
- * -ERANGE.
- *
- * If @count is smaller than the length of the string, copies @count-1 bytes,
- * sets the last byte of @dst buffer to NUL and returns @count.
- */
-long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
-{
- mm_segment_t old_fs = get_fs();
- const void *src = unsafe_addr;
- long ret;
-
- if (unlikely(count <= 0))
- return 0;
- if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
- return -ERANGE;
-
- set_fs(KERNEL_DS);
- pagefault_disable();
-
- do {
- ret = __get_user(*dst++, (const char __user __force *)src++);
- } while (dst[-1] && ret == 0 && src - unsafe_addr < count);
-
- dst[-1] = '\0';
- pagefault_enable();
- set_fs(old_fs);
-
- return ret ? -EFAULT : src - unsafe_addr;
-}
-#endif /* HAVE_GET_KERNEL_NOFAULT */
/**
* copy_from_user_nofault(): safely attempt to read from a user-space location
@@ -221,14 +113,11 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
-
if (access_ok(src, size)) {
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, size);
pagefault_enable();
}
- force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -248,14 +137,12 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = force_uaccess_begin();
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
- force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -284,17 +171,14 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -324,14 +208,17 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs;
int ret;
- old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- force_uaccess_end(old_fs);
return ret;
}
+
+void __copy_overflow(int size, unsigned long count)
+{
+ WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+EXPORT_SYMBOL(__copy_overflow);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5604064df464..b41858ee937b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -52,6 +52,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
@@ -65,7 +66,7 @@ static int madvise_need_mmap_write(int behavior)
}
#ifdef CONFIG_ANON_VMA_NAME
-static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
size_t count;
@@ -81,78 +82,48 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name)
return anon_name;
}
-static void vma_anon_name_free(struct kref *kref)
+void anon_vma_name_free(struct kref *kref)
{
struct anon_vma_name *anon_name =
container_of(kref, struct anon_vma_name, kref);
kfree(anon_name);
}
-static inline bool has_vma_anon_name(struct vm_area_struct *vma)
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- return !vma->vm_file && vma->anon_name;
-}
-
-const char *vma_anon_name(struct vm_area_struct *vma)
-{
- if (!has_vma_anon_name(vma))
- return NULL;
-
mmap_assert_locked(vma->vm_mm);
- return vma->anon_name->name;
-}
-
-void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma)
-{
- if (!has_vma_anon_name(orig_vma))
- return;
-
- kref_get(&orig_vma->anon_name->kref);
- new_vma->anon_name = orig_vma->anon_name;
-}
-
-void free_vma_anon_name(struct vm_area_struct *vma)
-{
- struct anon_vma_name *anon_name;
-
- if (!has_vma_anon_name(vma))
- return;
+ if (vma->vm_file)
+ return NULL;
- anon_name = vma->anon_name;
- vma->anon_name = NULL;
- kref_put(&anon_name->kref, vma_anon_name_free);
+ return vma->anon_name;
}
/* mmap_lock should be write-locked */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- const char *anon_name;
+ struct anon_vma_name *orig_name = anon_vma_name(vma);
- if (!name) {
- free_vma_anon_name(vma);
+ if (!anon_name) {
+ vma->anon_name = NULL;
+ anon_vma_name_put(orig_name);
return 0;
}
- anon_name = vma_anon_name(vma);
- if (anon_name) {
- /* Same name, nothing to do here */
- if (!strcmp(name, anon_name))
- return 0;
+ if (anon_vma_name_eq(orig_name, anon_name))
+ return 0;
- free_vma_anon_name(vma);
- }
- vma->anon_name = anon_vma_name_alloc(name);
- if (!vma->anon_name)
- return -ENOMEM;
+ vma->anon_name = anon_vma_name_reuse(anon_name);
+ anon_vma_name_put(orig_name);
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- if (name)
+ if (anon_name)
return -EINVAL;
return 0;
@@ -161,17 +132,19 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
+ * Caller should ensure anon_name stability by raising its refcount even when
+ * anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
- const char *name)
+ struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
- if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
return 0;
}
@@ -179,7 +152,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, name);
+ vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@@ -209,7 +182,7 @@ success:
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
- error = replace_vma_anon_name(vma, name);
+ error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
@@ -530,6 +503,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
tlb_end_vma(tlb, vma);
}
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
+{
+ return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
+}
+
static long madvise_cold(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start_addr, unsigned long end_addr)
@@ -800,6 +778,29 @@ static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
return 0;
}
+static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long *end,
+ int behavior)
+{
+ if (!is_vm_hugetlb_page(vma)) {
+ unsigned int forbidden = VM_PFNMAP;
+
+ if (behavior != MADV_DONTNEED_LOCKED)
+ forbidden |= VM_LOCKED;
+
+ return !(vma->vm_flags & forbidden);
+ }
+
+ if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
+ return false;
+ if (start & ~huge_page_mask(hstate_vma(vma)))
+ return false;
+
+ *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ return true;
+}
+
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
@@ -808,7 +809,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
- if (!can_madv_lru_vma(vma))
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -830,7 +831,12 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
return -ENOMEM;
}
- if (!can_madv_lru_vma(vma))
+ /*
+ * Potential end adjustment for hugetlb vma is OK as
+ * the check below keeps end within vma.
+ */
+ if (!madvise_dontneed_free_valid_vma(vma, start, &end,
+ behavior))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -850,7 +856,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
VM_WARN_ON(start >= end);
}
- if (behavior == MADV_DONTNEED)
+ if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
return madvise_dontneed_single_vma(vma, start, end);
else if (behavior == MADV_FREE)
return madvise_free_single_vma(vma, start, end);
@@ -877,8 +883,8 @@ static long madvise_populate(struct vm_area_struct *vma,
* our VMA might have been split.
*/
if (!vma || start >= vma->vm_end) {
- vma = find_vma(mm, start);
- if (!vma || start < vma->vm_start)
+ vma = vma_lookup(mm, start);
+ if (!vma)
return -ENOMEM;
}
@@ -975,6 +981,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
unsigned long behavior)
{
int error;
+ struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
@@ -988,6 +995,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
@@ -1040,8 +1048,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
+ anon_name = anon_vma_name(vma);
+ anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
- vma_anon_name(vma));
+ anon_name);
+ anon_vma_name_put(anon_name);
out:
/*
@@ -1091,6 +1102,8 @@ static int madvise_inject_error(int behavior,
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
ret = memory_failure(pfn, MF_COUNT_INCREASED);
+ if (ret == -EOPNOTSUPP)
+ ret = 0;
}
if (ret)
@@ -1113,6 +1126,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
@@ -1225,7 +1239,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long name)
+ unsigned long anon_name)
{
int error;
@@ -1234,7 +1248,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (const char *)name);
+ (struct anon_vma_name *)anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1246,7 +1260,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
}
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name)
+ unsigned long len_in, struct anon_vma_name *anon_name)
{
unsigned long end;
unsigned long len;
@@ -1266,7 +1280,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)name,
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
@@ -1450,15 +1464,21 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
+ /*
+ * do_madvise returns ENOMEM if unmapped holes are present
+ * in the passed VMA. process_madvise() is expected to skip
+ * unmapped holes passed to it in the 'struct iovec' list
+ * and not fail because of them. Thus treat -ENOMEM return
+ * from do_madvise as valid and continue processing.
+ */
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOMEM)
break;
iov_iter_advance(&iter, iovec.iov_len);
}
- if (ret == 0)
- ret = total_len - iov_iter_count(&iter);
+ ret = (total_len - iov_iter_count(&iter)) ? : ret;
release_mm:
mmput(mm);
diff --git a/mm/memblock.c b/mm/memblock.c
index 1018e50566f3..e4f03a6e8e56 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -366,14 +366,20 @@ void __init memblock_discard(void)
addr = __pa(memblock.reserved.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
- memblock_free_late(addr, size);
+ if (memblock_reserved_in_slab)
+ kfree(memblock.reserved.regions);
+ else
+ memblock_free_late(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
addr = __pa(memblock.memory.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
- memblock_free_late(addr, size);
+ if (memblock_memory_in_slab)
+ kfree(memblock.memory.regions);
+ else
+ memblock_free_late(addr, size);
}
memblock_memory = NULL;
@@ -1278,11 +1284,10 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
{
int zone_nid = zone_to_nid(zone);
phys_addr_t spa, epa;
- int nid;
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
while (*idx != U64_MAX) {
unsigned long epfn = PFN_DOWN(epa);
@@ -1309,7 +1314,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
__next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
&memblock.memory, &memblock.reserved,
- &spa, &epa, &nid);
+ &spa, &epa, NULL);
}
/* signal end of iteration */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2aaa400f34d6..725f76723220 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -53,6 +53,7 @@
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
+#include <linux/memremap.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
@@ -254,7 +255,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
bool mem_cgroup_kmem_disabled(void)
{
@@ -298,9 +299,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
- spin_lock_irqsave(&css_set_lock, flags);
+ spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
- spin_unlock_irqrestore(&css_set_lock, flags);
+ spin_unlock_irqrestore(&objcg_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
@@ -332,7 +333,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
- spin_lock_irq(&css_set_lock);
+ spin_lock_irq(&objcg_lock);
/* 1) Ready to reparent active objcg. */
list_add(&objcg->list, &memcg->objcg_list);
@@ -342,54 +343,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
/* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
/*
- * This will be used as a shrinker list's index.
- * The main reason for not using cgroup id for this:
- * this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
- *
- * The current size of the caches array is stored in memcg_nr_cache_ids. It
- * will double each time we have to increase it.
- */
-static DEFINE_IDA(memcg_cache_ida);
-int memcg_nr_cache_ids;
-
-/* Protects memcg_nr_cache_ids */
-static DECLARE_RWSEM(memcg_cache_ids_sem);
-
-void memcg_get_cache_ids(void)
-{
- down_read(&memcg_cache_ids_sem);
-}
-
-void memcg_put_cache_ids(void)
-{
- up_read(&memcg_cache_ids_sem);
-}
-
-/*
- * MIN_SIZE is different than 1, because we would like to avoid going through
- * the alloc/free process all the time. In a small machine, 4 kmem-limited
- * cgroups is a reasonable guess. In the future, it could be a parameter or
- * tunable, but that is strictly not necessary.
- *
- * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
- * this constant directly from cgroup, but it is understandable that this is
- * better kept as an internal representation in cgroup.c. In any case, the
- * cgrp_id space is not getting any smaller, and we don't have to necessarily
- * increase ours as well if it increases.
- */
-#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
-
-/*
* A lot of the calls to the cache allocation functions are expected to be
* inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
@@ -629,6 +588,35 @@ static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
+ * not rely on this as part of an acquired spinlock_t lock. These functions are
+ * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
+ * is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#else
+ VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_enable();
+#endif
+}
+
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
unsigned int x;
@@ -705,6 +693,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
+ /*
+ * The caller from rmap relay on disabled preemption becase they never
+ * update their counter from in-interrupt context. For these two
+ * counters we check that the update is never performed from an
+ * interrupt context while other caller need to have disabled interrupt.
+ */
+ __memcg_stats_lock();
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ switch (idx) {
+ case NR_ANON_MAPPED:
+ case NR_FILE_MAPPED:
+ case NR_ANON_THPS:
+ case NR_SHMEM_PMDMAPPED:
+ case NR_FILE_PMDMAPPED:
+ WARN_ON_ONCE(!in_task());
+ break;
+ default:
+ WARN_ON_ONCE(!irqs_disabled());
+ }
+ }
+
/* Update memcg */
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
@@ -712,6 +721,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
+ memcg_stats_unlock();
}
/**
@@ -794,8 +804,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
+ memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
memcg_rstat_updated(memcg, count);
+ memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -858,6 +870,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
*/
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
@@ -1257,8 +1272,7 @@ struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
* @nr_pages: positive when adding or negative when removing
*
* This function must be called under lru_lock, just before a page is added
- * to or just after a page is removed from an lru list (that ordering being
- * so as to allow it to check that lru_size 0 is consistent with list_empty).
+ * to or just after a page is removed from an lru list.
*/
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zid, int nr_pages)
@@ -1371,6 +1385,7 @@ struct memory_stat {
static const struct memory_stat memory_stats[] = {
{ "anon", NR_ANON_MAPPED },
{ "file", NR_FILE_PAGES },
+ { "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
@@ -1795,20 +1810,16 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
-enum oom_status {
- OOM_SUCCESS,
- OOM_FAILED,
- OOM_ASYNC,
- OOM_SKIPPED
-};
-
-static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+/*
+ * Returns true if successfully killed one or more processes. Though in some
+ * corner cases it can return true even without killing any process.
+ */
+static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
- enum oom_status ret;
- bool locked;
+ bool locked, ret;
if (order > PAGE_ALLOC_COSTLY_ORDER)
- return OOM_SKIPPED;
+ return false;
memcg_memory_event(memcg, MEMCG_OOM);
@@ -1831,14 +1842,13 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
* victim and then we have to bail out from the charge path.
*/
if (memcg->oom_kill_disable) {
- if (!current->in_user_fault)
- return OOM_SKIPPED;
- css_get(&memcg->css);
- current->memcg_in_oom = memcg;
- current->memcg_oom_gfp_mask = mask;
- current->memcg_oom_order = order;
-
- return OOM_ASYNC;
+ if (current->in_user_fault) {
+ css_get(&memcg->css);
+ current->memcg_in_oom = memcg;
+ current->memcg_oom_gfp_mask = mask;
+ current->memcg_oom_order = order;
+ }
+ return false;
}
mem_cgroup_mark_under_oom(memcg);
@@ -1849,10 +1859,7 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
mem_cgroup_oom_notify(memcg);
mem_cgroup_unmark_under_oom(memcg);
- if (mem_cgroup_out_of_memory(memcg, mask, order))
- ret = OOM_SUCCESS;
- else
- ret = OOM_FAILED;
+ ret = mem_cgroup_out_of_memory(memcg, mask, order);
if (locked)
mem_cgroup_oom_unlock(memcg);
@@ -2085,45 +2092,47 @@ void unlock_page_memcg(struct page *page)
folio_memcg_unlock(page_folio(page));
}
-struct obj_stock {
+struct memcg_stock_pcp {
+ local_lock_t stock_lock;
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
-#else
- int dummy[0];
#endif
-};
-
-struct memcg_stock_pcp {
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
- struct obj_stock task_obj;
- struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+ .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct obj_stock *stock);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else
-static inline void drain_obj_stock(struct obj_stock *stock)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
+ return NULL;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
{
return false;
}
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
+{
+}
#endif
/**
@@ -2146,7 +2155,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2154,7 +2163,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2183,6 +2192,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
/*
@@ -2190,28 +2200,25 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(&stock->irq_obj);
- if (in_task())
- drain_obj_stock(&stock->task_obj);
+ old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned long flags;
-
- local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
@@ -2223,8 +2230,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
+}
- local_irq_restore(flags);
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ unsigned long flags;
+
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ __refill_stock(memcg, nr_pages);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -2244,7 +2258,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ migrate_disable();
+ curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2267,7 +2282,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}
@@ -2541,7 +2556,6 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
- enum oom_status oom_status;
unsigned long nr_reclaimed;
bool passed_oom = false;
bool may_swap = true;
@@ -2570,15 +2584,6 @@ retry:
}
/*
- * Memcg doesn't have a dedicated reserve for atomic
- * allocations. But like the global atomic pool, we need to
- * put the burden of reclaim on regular allocation requests
- * and let these go through as privileged allocations.
- */
- if (gfp_mask & __GFP_ATOMIC)
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2644,15 +2649,20 @@ retry:
* a forward progress or bypass the charge if the oom killer
* couldn't make any progress.
*/
- oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
- get_order(nr_pages * PAGE_SIZE));
- if (oom_status == OOM_SUCCESS) {
+ if (mem_cgroup_oom(mem_over_limit, gfp_mask,
+ get_order(nr_pages * PAGE_SIZE))) {
passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
}
nomem:
- if (!(gfp_mask & __GFP_NOFAIL))
+ /*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
return -ENOMEM;
force:
/*
@@ -2688,7 +2698,7 @@ done_restock:
READ_ONCE(memcg->swap.high);
/* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ if (!in_task()) {
if (mem_high) {
schedule_work(&memcg->high_work);
break;
@@ -2712,6 +2722,11 @@ done_restock:
}
} while ((memcg = parent_mem_cgroup(memcg)));
+ if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
+ !(current->flags & PF_MEMALLOC) &&
+ gfpflags_allow_blocking(gfp_mask)) {
+ mem_cgroup_handle_over_high();
+ }
return 0;
}
@@ -2748,20 +2763,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
folio->memcg_data = (unsigned long)memcg;
}
-static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
-retry:
- memcg = obj_cgroup_memcg(objcg);
- if (unlikely(!css_tryget(&memcg->css)))
- goto retry;
- rcu_read_unlock();
-
- return memcg;
-}
-
#ifdef CONFIG_MEMCG_KMEM
/*
* The allocated objcg pointers array is not accounted directly.
@@ -2771,41 +2772,6 @@ retry:
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
/*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
-{
- struct memcg_stock_pcp *stock;
-
- if (likely(in_task())) {
- *pflags = 0UL;
- preempt_disable();
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->task_obj;
- }
-
- local_irq_save(*pflags);
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags)
-{
- if (likely(in_task()))
- preempt_enable();
- else
- local_irq_restore(flags);
-}
-
-/*
* mod_objcg_mlstate() may be called with irq enabled, so
* mod_memcg_lruvec_state() should be used.
*/
@@ -2936,48 +2902,17 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
return objcg;
}
-static int memcg_alloc_cache_id(void)
+static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
- int id, size;
- int err;
-
- id = ida_simple_get(&memcg_cache_ida,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
- if (id < 0)
- return id;
-
- if (id < memcg_nr_cache_ids)
- return id;
-
- /*
- * There's no space for the new id in memcg_caches arrays,
- * so we have to grow them.
- */
- down_write(&memcg_cache_ids_sem);
-
- size = 2 * (id + 1);
- if (size < MEMCG_CACHES_MIN_SIZE)
- size = MEMCG_CACHES_MIN_SIZE;
- else if (size > MEMCG_CACHES_MAX_SIZE)
- size = MEMCG_CACHES_MAX_SIZE;
-
- err = memcg_update_all_list_lrus(size);
- if (!err)
- memcg_nr_cache_ids = size;
-
- up_write(&memcg_cache_ids_sem);
-
- if (err) {
- ida_simple_remove(&memcg_cache_ida, id);
- return err;
+ mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (nr_pages > 0)
+ page_counter_charge(&memcg->kmem, nr_pages);
+ else
+ page_counter_uncharge(&memcg->kmem, -nr_pages);
}
- return id;
}
-static void memcg_free_cache_id(int id)
-{
- ida_simple_remove(&memcg_cache_ida, id);
-}
/*
* obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
@@ -2991,8 +2926,7 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
memcg = get_mem_cgroup_from_objcg(objcg);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ memcg_account_kmem(memcg, -nr_pages);
refill_stock(memcg, nr_pages);
css_put(&memcg->css);
@@ -3018,8 +2952,7 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_charge(&memcg->kmem, nr_pages);
+ memcg_account_kmem(memcg, nr_pages);
out:
css_put(&memcg->css);
@@ -3075,17 +3008,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
+ struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
int *bytes;
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
* changes.
*/
if (stock->cached_objcg != objcg) {
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
@@ -3129,38 +3066,51 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- put_obj_stock(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
+ struct memcg_stock_pcp *stock;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false;
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- put_obj_stock(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
-static void drain_obj_stock(struct obj_stock *stock)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
if (!old)
- return;
+ return NULL;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages)
- obj_cgroup_uncharge_pages(old, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(old);
+
+ memcg_account_kmem(memcg, -nr_pages);
+ __refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+ }
/*
* The leftover is flushed to the centralized per-memcg value.
@@ -3195,8 +3145,12 @@ static void drain_obj_stock(struct obj_stock *stock)
stock->cached_pgdat = NULL;
}
- obj_cgroup_put(old);
stock->cached_objcg = NULL;
+ /*
+ * The `old' objects needs to be released by the caller via
+ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+ */
+ return old;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -3204,13 +3158,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
- if (in_task() && stock->task_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
- }
- if (stock->irq_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3221,12 +3170,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
+ struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
unsigned int nr_pages = 0;
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3240,7 +3193,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- put_obj_stock(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -3625,28 +3580,23 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
- int memcg_id;
if (cgroup_memory_nokmem)
return 0;
- BUG_ON(memcg->kmemcg_id >= 0);
-
- memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0)
- return memcg_id;
+ if (unlikely(mem_cgroup_is_root(memcg)))
+ return 0;
objcg = obj_cgroup_alloc();
- if (!objcg) {
- memcg_free_cache_id(memcg_id);
+ if (!objcg)
return -ENOMEM;
- }
+
objcg->memcg = memcg;
rcu_assign_pointer(memcg->objcg, objcg);
static_branch_enable(&memcg_kmem_enabled_key);
- memcg->kmemcg_id = memcg_id;
+ memcg->kmemcg_id = memcg->id.id;
return 0;
}
@@ -3654,9 +3604,11 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct mem_cgroup *parent;
- int kmemcg_id;
- if (memcg->kmemcg_id == -1)
+ if (cgroup_memory_nokmem)
+ return;
+
+ if (unlikely(mem_cgroup_is_root(memcg)))
return;
parent = parent_mem_cgroup(memcg);
@@ -3665,19 +3617,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
memcg_reparent_objcgs(memcg, parent);
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
/*
* After we have finished memcg_reparent_objcgs(), all list_lrus
* corresponding to this cgroup are guaranteed to remain empty.
* The ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
+ * memcg_reparent_list_lrus().
*/
- memcg_drain_all_list_lrus(kmemcg_id, parent);
-
- memcg_free_cache_id(kmemcg_id);
- memcg->kmemcg_id = -1;
+ memcg_reparent_list_lrus(memcg, parent);
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3763,8 +3709,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
break;
case RES_SOFT_LIMIT:
- memcg->soft_limit = nr_pages;
- ret = 0;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ ret = -EOPNOTSUPP;
+ } else {
+ memcg->soft_limit = nr_pages;
+ ret = 0;
+ }
break;
}
return ret ?: nbytes;
@@ -4740,6 +4690,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
char *endp;
int ret;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return -EOPNOTSUPP;
+
buf = strstrip(buf);
efd = simple_strtoul(buf, &endp, 10);
@@ -5067,18 +5020,8 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
- int tmp = node;
- /*
- * This routine is called against possible nodes.
- * But it's BUG to call kmalloc() against offline node.
- *
- * TODO: this routine can waste much memory for nodes which will
- * never be onlined. It's better to use memory hotplug callback
- * function.
- */
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, node);
if (!pn)
return 1;
@@ -5090,8 +5033,6 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
}
lruvec_init(&pn->lruvec);
- pn->usage_in_excess = 0;
- pn->on_tree = false;
pn->memcg = memcg;
memcg->nodeinfo[node] = pn;
@@ -5137,8 +5078,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
- 1, MEM_CGROUP_ID_MAX,
- GFP_KERNEL);
+ 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL);
if (memcg->id.id < 0) {
error = memcg->id.id;
goto fail;
@@ -5192,7 +5132,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
struct mem_cgroup *memcg, *old_memcg;
- long error = -ENOMEM;
old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
@@ -5221,34 +5160,26 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- /* The following stuff does not apply to the root */
- error = memcg_online_kmem(memcg);
- if (error)
- goto fail;
-
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_inc(&memcg_sockets_enabled_key);
return &memcg->css;
-fail:
- mem_cgroup_id_remove(memcg);
- mem_cgroup_free(memcg);
- return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ if (memcg_online_kmem(memcg))
+ goto remove_id;
+
/*
* A memcg must be visible for expand_shrinker_info()
* by the time the maps are allocated. So, we allocate maps
* here, when for_each_mem_cgroup() can't skip it.
*/
- if (alloc_shrinker_info(memcg)) {
- mem_cgroup_id_remove(memcg);
- return -ENOMEM;
- }
+ if (alloc_shrinker_info(memcg))
+ goto offline_kmem;
/* Online state pins memcg ID, memcg ID pins CSS */
refcount_set(&memcg->id.ref, 1);
@@ -5258,6 +5189,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
return 0;
+offline_kmem:
+ memcg_offline_kmem(memcg);
+remove_id:
+ mem_cgroup_id_remove(memcg);
+ return -ENOMEM;
}
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
@@ -5315,9 +5251,6 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
free_shrinker_info(memcg);
-
- /* Need to offline kmem if online_css() fails */
- memcg_offline_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5503,17 +5436,12 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;
/*
- * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
- * a device and because they are not accessible by CPU they are store
- * as special swap entry in the CPU page table.
+ * Handle device private pages that are not accessible by the CPU, but
+ * stored as special swap entries in the page table.
*/
if (is_device_private_entry(ent)) {
page = pfn_swap_entry_to_page(ent);
- /*
- * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
- * a refcount of 1 when free (unlike normal page)
- */
- if (!page_ref_add_unless(page, 1, 1))
+ if (!get_page_unless_zero(page))
return NULL;
return page;
}
@@ -6801,8 +6729,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
- page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+ if (ug->nr_kmem)
+ memcg_account_kmem(ug->memcg, -ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
@@ -6821,7 +6749,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
long nr_pages;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
- bool use_objcg = folio_memcg_kmem(folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -6830,7 +6757,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
* folio memcg or objcg at this point, we have fully
* exclusive access to the folio.
*/
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
objcg = __folio_objcg(folio);
/*
* This get matches the put at the end of the function and
@@ -6858,7 +6785,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
nr_pages = folio_nr_pages(folio);
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
@@ -6968,7 +6895,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
return;
/* Do not associate the sock with unrelated interrupted task's memcg. */
- if (in_interrupt())
+ if (!in_task())
return;
rcu_read_lock();
@@ -7053,7 +6980,7 @@ static int __init cgroup_memory(char *s)
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
}
- return 0;
+ return 1;
}
__setup("cgroup.memory=", cgroup_memory);
@@ -7121,19 +7048,19 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
/**
* mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
+ * @folio: folio whose memsw charge to transfer
* @entry: swap entry to move the charge to
*
- * Transfer the memsw charge of @page to @entry.
+ * Transfer the memsw charge of @folio to @entry.
*/
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
{
struct mem_cgroup *memcg, *swap_memcg;
unsigned int nr_entries;
unsigned short oldid;
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
if (mem_cgroup_disabled())
return;
@@ -7141,9 +7068,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
- VM_WARN_ON_ONCE_PAGE(!memcg, page);
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
return;
@@ -7153,16 +7080,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = thp_nr_pages(page);
+ nr_entries = folio_nr_pages(folio);
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
nr_entries);
- VM_BUG_ON_PAGE(oldid, page);
+ VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->memcg_data = 0;
+ folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
@@ -7179,9 +7106,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
- VM_BUG_ON(!irqs_disabled());
+ memcg_stats_lock();
mem_cgroup_charge_statistics(memcg, -nr_entries);
- memcg_check_events(memcg, page_to_nid(page));
+ memcg_stats_unlock();
+ memcg_check_events(memcg, folio_nid(folio));
css_put(&memcg->css);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..08f5f8304746 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -31,20 +31,28 @@
static void memfd_tag_pins(struct xa_state *xas)
{
struct page *page;
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
lru_add_drain();
xas_lock_irq(xas);
xas_for_each(xas, page, ULONG_MAX) {
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas->xa_index);
- if (page_count(page) - page_mapcount(page) > 1)
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) &&
+ page_count(page) - total_mapcount(page) != cache_count)
xas_set_mark(xas, MEMFD_TAG_PINNED);
+ if (cache_count != 1)
+ xas_set(xas, page->index + cache_count);
- if (++tagged % XA_CHECK_SCHED)
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(xas);
xas_unlock_irq(xas);
@@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_lock_irq(&xas);
xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas.xa_index);
- if (page_count(page) - page_mapcount(page) != 1) {
+
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) && cache_count !=
+ page_count(page) - total_mapcount(page)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
@@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct address_space *mapping)
}
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- if (++tagged % XA_CHECK_SCHED)
+
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(&xas);
xas_unlock_irq(&xas);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14ae5c18e776..dcb6bb9cf731 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -130,12 +130,6 @@ static int hwpoison_filter_dev(struct page *p)
hwpoison_filter_dev_minor == ~0U)
return 0;
- /*
- * page_mapping() does not accept slab pages.
- */
- if (PageSlab(p))
- return -EINVAL;
-
mapping = page_mapping(p);
if (mapping == NULL || mapping->host == NULL)
return -EINVAL;
@@ -258,16 +252,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
- if (flags & MF_ACTION_REQUIRED) {
- if (t == current)
- ret = force_sig_mceerr(BUS_MCEERR_AR,
- (void __user *)tk->addr, addr_lsb);
- else
- /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */
- ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
- addr_lsb, t);
- } else {
+ if ((flags & MF_ACTION_REQUIRED) && (t == current))
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)tk->addr, addr_lsb);
+ else
/*
+ * Signal other processes sharing the page if they have
+ * PF_MCE_EARLY set.
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
* This could cause a loop when the user sets SIGBUS
@@ -275,7 +266,6 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
*/
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
- }
if (ret < 0)
pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
@@ -315,6 +305,7 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
pmd_t *pmd;
pte_t *pte;
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return 0;
@@ -487,12 +478,13 @@ static struct task_struct *task_early_kill(struct task_struct *tsk,
static void collect_procs_anon(struct page *page, struct list_head *to_kill,
int force_early)
{
+ struct folio *folio = page_folio(page);
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma_read(page);
+ av = folio_lock_anon_vma_read(folio);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -707,8 +699,10 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
(void *)&priv);
if (ret == 1 && priv.tk.addr)
kill_proc(&priv.tk, pfn, flags);
+ else
+ ret = 0;
mmap_read_unlock(p->mm);
- return ret ? -EFAULT : -EHWPOISON;
+ return ret > 0 ? -EHWPOISON : -EFAULT;
}
static const char *action_name[] = {
@@ -739,6 +733,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
+ [MF_MSG_DIFFERENT_PAGE_SIZE] = "different page size",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1182,12 +1177,18 @@ void ClearPageHWPoisonTakenOff(struct page *page)
* does not return true for hugetlb or device memory pages, so it's assumed
* to be called only in the context where we never have such pages.
*/
-static inline bool HWPoisonHandlable(struct page *page)
+static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{
- return PageLRU(page) || __PageMovable(page) || is_free_buddy_page(page);
+ bool movable = false;
+
+ /* Soft offline could mirgate non-LRU movable pages */
+ if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
+ movable = true;
+
+ return movable || PageLRU(page) || is_free_buddy_page(page);
}
-static int __get_hwpoison_page(struct page *page)
+static int __get_hwpoison_page(struct page *page, unsigned long flags)
{
struct page *head = compound_head(page);
int ret = 0;
@@ -1202,7 +1203,7 @@ static int __get_hwpoison_page(struct page *page)
* for any unsupported type of page in order to reduce the risk of
* unexpected races caused by taking a page refcount.
*/
- if (!HWPoisonHandlable(head))
+ if (!HWPoisonHandlable(head, flags))
return -EBUSY;
if (get_page_unless_zero(head)) {
@@ -1227,7 +1228,7 @@ static int get_any_page(struct page *p, unsigned long flags)
try_again:
if (!count_increased) {
- ret = __get_hwpoison_page(p);
+ ret = __get_hwpoison_page(p, flags);
if (!ret) {
if (page_count(p)) {
/* We raced with an allocation, retry. */
@@ -1255,7 +1256,7 @@ try_again:
}
}
- if (PageHuge(p) || HWPoisonHandlable(p)) {
+ if (PageHuge(p) || HWPoisonHandlable(p, flags)) {
ret = 1;
} else {
/*
@@ -1347,6 +1348,7 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page *hpage)
{
+ struct folio *folio = page_folio(hpage);
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
struct address_space *mapping;
LIST_HEAD(tokill);
@@ -1411,26 +1413,22 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- if (!PageHuge(hpage)) {
- try_to_unmap(hpage, ttu);
+ if (PageHuge(hpage) && !PageAnon(hpage)) {
+ /*
+ * For hugetlb pages in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higher level.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (mapping) {
+ try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else
+ pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
} else {
- if (!PageAnon(hpage)) {
- /*
- * For hugetlb pages in shared mappings, try_to_unmap
- * could potentially call huge_pmd_unshare. Because of
- * this, take semaphore in write mode here and set
- * TTU_RMAP_LOCKED to indicate we have taken the lock
- * at this higher level.
- */
- mapping = hugetlb_page_mapping_lock_write(hpage);
- if (mapping) {
- try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
- } else
- pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
- } else {
- try_to_unmap(hpage, ttu);
- }
+ try_to_unmap(folio, ttu);
}
unmap_success = !page_mapped(hpage);
@@ -1526,7 +1524,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestClearPageHWPoison(head))
num_poisoned_pages_dec();
unlock_page(head);
- return 0;
+ return -EOPNOTSUPP;
}
unlock_page(head);
res = MF_FAILED;
@@ -1543,8 +1541,27 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
}
lock_page(head);
+
+ /*
+ * The page could have changed compound pages due to race window.
+ * If this happens just bail out.
+ */
+ if (!PageHuge(p) || compound_head(p) != head) {
+ action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED);
+ res = -EBUSY;
+ goto out;
+ }
+
page_flags = head->flags;
+ if (hwpoison_filter(p)) {
+ if (TestClearPageHWPoison(head))
+ num_poisoned_pages_dec();
+ put_page(p);
+ res = -EOPNOTSUPP;
+ goto out;
+ }
+
/*
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
* simply disable it. In order to make it work properly, we need
@@ -1596,6 +1613,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
}
/*
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
+
+ /*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
* lock_page(), but dax pages do not use the page lock. This
@@ -1607,7 +1630,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto out;
if (hwpoison_filter(page)) {
- rc = 0;
+ rc = -EOPNOTSUPP;
goto unlock;
}
@@ -1632,7 +1655,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* SIGBUS (i.e. MF_MUST_KILL)
*/
flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(page, &tokill, true);
list_for_each_entry(tk, &tokill, nd)
if (tk->size_shift)
@@ -1647,7 +1670,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
start = (page->index << PAGE_SHIFT) & ~(size - 1);
unmap_mapping_range(page->mapping, start, size, 0);
}
- kill_procs(&tokill, flags & MF_MUST_KILL, false, pfn, flags);
+ kill_procs(&tokill, true, false, pfn, flags);
rc = 0;
unlock:
dax_unlock_page(page, cookie);
@@ -1676,12 +1699,15 @@ static DEFINE_MUTEX(mf_mutex);
*
* Must run in process context (e.g. a work queue) with interrupts
* enabled and no spinlocks hold.
+ *
+ * Return: 0 for successfully handled the memory error,
+ * -EOPNOTSUPP for memory_filter() filtered the error event,
+ * < 0(except -EOPNOTSUPP) on failure.
*/
int memory_failure(unsigned long pfn, int flags)
{
struct page *p;
struct page *hpage;
- struct page *orig_head;
struct dev_pagemap *pgmap;
int res = 0;
unsigned long page_flags;
@@ -1727,7 +1753,7 @@ try_again:
goto unlock_mutex;
}
- orig_head = hpage = compound_head(p);
+ hpage = compound_head(p);
num_poisoned_pages_inc();
/*
@@ -1808,10 +1834,21 @@ try_again:
lock_page(p);
/*
- * The page could have changed compound pages during the locking.
- * If this happens just bail out.
+ * We're only intended to deal with the non-Compound page here.
+ * However, the page could have changed compound pages due to
+ * race window. If this happens, we could try again to hopefully
+ * handle the page next round.
*/
- if (PageCompound(p) && compound_head(p) != orig_head) {
+ if (PageCompound(p)) {
+ if (retry) {
+ if (TestClearPageHWPoison(p))
+ num_poisoned_pages_dec();
+ unlock_page(p);
+ put_page(p);
+ flags &= ~MF_COUNT_INCREASED;
+ retry = false;
+ goto try_again;
+ }
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto unlock_page;
@@ -1831,6 +1868,7 @@ try_again:
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
+ res = -EOPNOTSUPP;
goto unlock_mutex;
}
@@ -1839,7 +1877,7 @@ try_again:
* page_lock. We need wait writeback completion for this page or it
* may trigger vfs BUG while evict inode.
*/
- if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
+ if (!PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
@@ -2133,7 +2171,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
*/
static int __soft_offline_page(struct page *page)
{
- int ret = 0;
+ long ret = 0;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_head(page);
char const *msg_page[] = {"page", "hugepage"};
@@ -2144,12 +2182,6 @@ static int __soft_offline_page(struct page *page)
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
- /*
- * Check PageHWPoison again inside page lock because PageHWPoison
- * is set by memory_failure() outside page lock. Note that
- * memory_failure() also double-checks PageHWPoison inside page lock,
- * so there's no race between soft_offline_page() and memory_failure().
- */
lock_page(page);
if (!PageHuge(page))
wait_on_page_writeback(page);
@@ -2160,7 +2192,7 @@ static int __soft_offline_page(struct page *page)
return 0;
}
- if (!PageHuge(page))
+ if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page))
/*
* Try to invalidate first. This should work for
* non dirty unmapped page cache pages.
@@ -2168,10 +2200,6 @@ static int __soft_offline_page(struct page *page)
ret = invalidate_inode_page(page);
unlock_page(page);
- /*
- * RED-PEN would be better to keep it isolated here, but we
- * would need to fix isolation locking first.
- */
if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
page_handle_poison(page, false, true);
@@ -2190,7 +2218,7 @@ static int __soft_offline_page(struct page *page)
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
+ pr_info("soft offline: %#lx: %s migration failed %ld, type %pGp\n",
pfn, msg_page[huge], ret, &page->flags);
if (ret > 0)
ret = -EBUSY;
@@ -2282,7 +2310,7 @@ int soft_offline_page(unsigned long pfn, int flags)
retry:
get_online_mems();
- ret = get_hwpoison_page(page, flags);
+ ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
put_online_mems();
if (ret > 0) {
diff --git a/mm/memory.c b/mm/memory.c
index c125c4969913..be44d0b36b18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -735,9 +735,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
set_pte_at(vma->vm_mm, address, ptep, pte);
- if (vma->vm_flags & VM_LOCKED)
- mlock_vma_page(page);
-
/*
* No need to invalidate - it was non-present before. However
* secondary CPUs may have mappings that need invalidating.
@@ -1309,22 +1306,34 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
- struct address_space *zap_mapping; /* Check page->mapping if set */
struct folio *single_folio; /* Locked folio to be unmapped */
+ bool even_cows; /* Zap COWed private pages too? */
};
-/*
- * We set details->zap_mapping when we want to unmap shared but keep private
- * pages. Return true if skip zapping this page, false otherwise.
- */
-static inline bool
-zap_skip_check_mapping(struct zap_details *details, struct page *page)
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
{
- if (!details || !page)
- return false;
+ /* By default, zap all pages */
+ if (!details)
+ return true;
- return details->zap_mapping &&
- (details->zap_mapping != page_rmapping(page));
+ /* Or, we zap COWed pages only if the caller wants to */
+ return details->even_cows;
+}
+
+/* Decides whether we should zap this page with the page pointer specified */
+static inline bool should_zap_page(struct zap_details *details, struct page *page)
+{
+ /* If we can make a decision without *page.. */
+ if (should_zap_cows(details))
+ return true;
+
+ /* E.g. the caller passes NULL for the case of a zero page */
+ if (!page)
+ return true;
+
+ /* Otherwise we should only zap non-anon pages */
+ return !PageAnon(page);
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1349,6 +1358,8 @@ again:
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
+ struct page *page;
+
if (pte_none(ptent))
continue;
@@ -1356,10 +1367,8 @@ again:
break;
if (pte_present(ptent)) {
- struct page *page;
-
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(zap_skip_check_mapping(details, page)))
+ if (unlikely(!should_zap_page(details, page)))
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
@@ -1377,7 +1386,7 @@ again:
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+ page_remove_rmap(page, vma, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(__tlb_remove_page(tlb, page))) {
@@ -1391,34 +1400,32 @@ again:
entry = pte_to_swp_entry(ptent);
if (is_device_private_entry(entry) ||
is_device_exclusive_entry(entry)) {
- struct page *page = pfn_swap_entry_to_page(entry);
-
- if (unlikely(zap_skip_check_mapping(details, page)))
+ page = pfn_swap_entry_to_page(entry);
+ if (unlikely(!should_zap_page(details, page)))
continue;
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
-
if (is_device_private_entry(entry))
- page_remove_rmap(page, false);
-
+ page_remove_rmap(page, vma, false);
put_page(page);
- continue;
- }
-
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ continue;
rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
-
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else if (is_migration_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
+ if (!should_zap_page(details, page))
+ continue;
rss[mm_counter(page)]--;
+ } else if (is_hwpoison_entry(entry)) {
+ if (!should_zap_cows(details))
+ continue;
+ } else {
+ /* We should have covered all the swap entry types */
+ WARN_ON_ONCE(1);
}
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1705,7 +1712,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
- if (address < vma->vm_start || address + size > vma->vm_end ||
+ if (!range_in_vma(vma, address, address + size) ||
!(vma->vm_flags & VM_PFNMAP))
return;
@@ -1753,16 +1760,16 @@ static int validate_page_before_insert(struct page *page)
return 0;
}
-static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
if (!pte_none(*pte))
return -EBUSY;
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(mm, addr, pte, mk_pte(page, prot));
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+ page_add_file_rmap(page, vma, false);
+ set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
return 0;
}
@@ -1776,7 +1783,6 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte,
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page, pgprot_t prot)
{
- struct mm_struct *mm = vma->vm_mm;
int retval;
pte_t *pte;
spinlock_t *ptl;
@@ -1785,17 +1791,17 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
if (retval)
goto out;
retval = -ENOMEM;
- pte = get_locked_pte(mm, addr, &ptl);
+ pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
pte_unmap_unlock(pte, ptl);
out:
return retval;
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
+static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1805,7 +1811,7 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
- return insert_page_into_pte_locked(mm, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1842,7 +1848,7 @@ more:
start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pte,
+ int err = insert_page_in_batch_locked(vma, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
pte_unmap_unlock(start_pte, pte_lock);
@@ -3098,7 +3104,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
}
/* Free the old page.. */
@@ -3118,16 +3124,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (page_copied && (vma->vm_flags & VM_LOCKED)) {
- lock_page(old_page); /* LRU manipulation */
- if (PageMlocked(old_page))
- munlock_vma_page(old_page);
- unlock_page(old_page);
- }
if (page_copied)
free_swap_cache(old_page);
put_page(old_page);
@@ -3291,19 +3287,35 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (PageAnon(vmf->page)) {
struct page *page = vmf->page;
- /* PageKsm() doesn't necessarily raise the page refcount */
- if (PageKsm(page) || page_count(page) != 1)
+ /*
+ * We have to verify under page lock: these early checks are
+ * just an optimization to avoid locking the page and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * PageKsm() doesn't necessarily raise the page refcount.
+ */
+ if (PageKsm(page) || page_count(page) > 3)
+ goto copy;
+ if (!PageLRU(page))
+ /*
+ * Note: We cannot easily detect+handle references from
+ * remote LRU pagevecs or references to PageLRU() pages.
+ */
+ lru_add_drain();
+ if (page_count(page) > 1 + PageSwapCache(page))
goto copy;
if (!trylock_page(page))
goto copy;
- if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ if (PageSwapCache(page))
+ try_to_free_swap(page);
+ if (PageKsm(page) || page_count(page) != 1) {
unlock_page(page);
goto copy;
}
/*
- * Ok, we've got the only map reference, and the only
- * page count reference, and the page is locked,
- * it's dark out, and we're wearing sunglasses. Hit it.
+ * Ok, we've got the only page reference from our mapping
+ * and the page is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
*/
unlock_page(page);
wp_page_reuse(vmf);
@@ -3340,12 +3352,8 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- zba = first_index;
- if (zba < vba)
- zba = vba;
- zea = last_index;
- if (zea > vea)
- zea = vea;
+ zba = max(first_index, vba);
+ zea = min(last_index, vea);
unmap_mapping_range_vma(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
@@ -3377,14 +3385,14 @@ void unmap_mapping_folio(struct folio *folio)
first_index = folio->index;
last_index = folio->index + folio_nr_pages(folio) - 1;
- details.zap_mapping = mapping;
+ details.even_cows = false;
details.single_folio = folio;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
/**
@@ -3406,15 +3414,15 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t first_index = start;
pgoff_t last_index = start + nr - 1;
- details.zap_mapping = even_cows ? NULL : mapping;
+ details.even_cows = even_cows;
if (last_index < first_index)
last_index = ULONG_MAX;
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3481,6 +3489,25 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
return 0;
}
+static inline bool should_try_to_free_swap(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int fault_flags)
+{
+ if (!PageSwapCache(page))
+ return false;
+ if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
+ PageMlocked(page))
+ return true;
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * user. Try freeing the swapcache to get rid of the swapcache
+ * reference only in case it's likely that we'll be the exlusive user.
+ */
+ return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ page_count(page) == 2;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3599,21 +3626,39 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- /*
- * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
- * release the swapcache from under us. The page pin, and pte_same
- * test below, are not enough to exclude that. Even if it is still
- * swapcache, we need to check that the page's swap has not changed.
- */
- if (unlikely((!PageSwapCache(page) ||
- page_private(page) != entry.val)) && swapcache)
- goto out_page;
-
- page = ksm_might_need_to_copy(page, vma, vmf->address);
- if (unlikely(!page)) {
- ret = VM_FAULT_OOM;
- page = swapcache;
- goto out_page;
+ if (swapcache) {
+ /*
+ * Make sure try_to_free_swap or swapoff did not release the
+ * swapcache from under us. The page pin, and pte_same test
+ * below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not
+ * changed.
+ */
+ if (unlikely(!PageSwapCache(page) ||
+ page_private(page) != entry.val))
+ goto out_page;
+
+ /*
+ * KSM sometimes has to copy on read faults, for example, if
+ * page->index of !PageKSM() pages would be nonlinear inside the
+ * anon VMA -- PageKSM() is lost on actual swapout.
+ */
+ page = ksm_might_need_to_copy(page, vma, vmf->address);
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ goto out_page;
+ }
+
+ /*
+ * If we want to map a page that's in the swapcache writable, we
+ * have to detect via the refcount if we're really the exclusive
+ * owner. Try removing the extra reference from the local LRU
+ * pagevecs if required.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
+ !PageKsm(page) && !PageLRU(page))
+ lru_add_drain();
}
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3632,19 +3677,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
- * The page isn't present yet, go ahead with the fault.
- *
- * Be careful about the sequence of operations here.
- * To get its accounting right, reuse_swap_page() must be called
- * while the page is counted on swap but not yet in mapcount i.e.
- * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
- * must be called after the swap_free(), or it will never succeed.
+ * Remove the swap entry and conditionally try to free up the swapcache.
+ * We're already holding a reference on the page but haven't mapped it
+ * yet.
*/
+ swap_free(entry);
+ if (should_try_to_free_swap(page, vma, vmf->flags))
+ try_to_free_swap(page);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+
+ /*
+ * Same logic as in do_wp_page(); however, optimize for fresh pages
+ * that are certainly not shared because we just allocated them without
+ * exposing them to the swapcache.
+ */
+ if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
+ (page != swapcache || page_count(page) == 1)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3670,10 +3721,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- swap_free(entry);
- if (mem_cgroup_swap_full(page) ||
- (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
- try_to_free_swap(page);
unlock_page(page);
if (page != swapcache && swapcache) {
/*
@@ -3871,11 +3918,16 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
- if (ret & VM_FAULT_LOCKED)
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
+ /* Retry if a clean page was removed from the cache. */
+ if (invalidate_inode_page(vmf->page))
+ poisonret = 0;
unlock_page(vmf->page);
+ }
put_page(vmf->page);
vmf->page = NULL;
- return VM_FAULT_HWPOISON;
+ return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -3947,7 +3999,8 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
- page_add_file_rmap(page, true);
+ page_add_file_rmap(page, vma, true);
+
/*
* deposit and withdraw with pmd lock held
*/
@@ -3996,7 +4049,7 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
+ page_add_file_rmap(page, vma, false);
}
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
@@ -4622,6 +4675,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
struct vm_fault vmf = {
.vma = vma,
.address = address & PAGE_MASK,
+ .real_address = address,
.flags = flags,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
@@ -5256,14 +5310,6 @@ void print_vma_addr(char *prefix, unsigned long ip)
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
- /*
- * Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_lock, this is safe because kernel memory doesn't
- * get paged out, therefore we'll never actually fault, and the
- * below annotations will generate false positives.
- */
- if (uaccess_kernel())
- return;
if (pagefault_disabled())
return;
__might_sleep(file, line);
@@ -5444,6 +5490,8 @@ long copy_huge_page_from_user(struct page *dst_page,
if (rc)
break;
+ flush_dcache_page(subpage);
+
cond_resched();
}
return ret_val;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2a9627dc784c..416b38ca8def 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -295,12 +295,6 @@ struct page *pfn_to_online_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);
-/*
- * Reasonably generic function for adding memory. It is
- * expected that archs that support memory hotplug will
- * call this function after deciding the zone to which to
- * add the new pages.
- */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
@@ -829,7 +823,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn
struct pglist_data *pgdat = NODE_DATA(nid);
int zid;
- for (zid = 0; zid <= ZONE_NORMAL; zid++) {
+ for (zid = 0; zid < ZONE_NORMAL; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (zone_intersects(zone, start_pfn, nr_pages))
@@ -1162,43 +1156,20 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid)
+static pg_data_t __ref *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
+ /*
+ * NODE_DATA is preallocated (free_area_init) but its internal
+ * state is not allocated completely. Add missing pieces.
+ * Completely offline nodes stay around and they just need
+ * reintialization.
+ */
pgdat = NODE_DATA(nid);
- if (!pgdat) {
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- return NULL;
-
- pgdat->per_cpu_nodestats =
- alloc_percpu(struct per_cpu_nodestat);
- arch_refresh_nodedata(nid, pgdat);
- } else {
- int cpu;
- /*
- * Reset the nr_zones, order and highest_zoneidx before reuse.
- * Note that kswapd will init kswapd_highest_zoneidx properly
- * when it starts in the near future.
- */
- pgdat->nr_zones = 0;
- pgdat->kswapd_order = 0;
- pgdat->kswapd_highest_zoneidx = 0;
- for_each_online_cpu(cpu) {
- struct per_cpu_nodestat *p;
-
- p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
- memset(p, 0, sizeof(*p));
- }
- }
-
- /* we can use NODE_DATA(nid) from here */
- pgdat->node_id = nid;
- pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
- free_area_init_core_hotplug(nid);
+ free_area_init_core_hotplug(pgdat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1210,6 +1181,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
* When memory is hot-added, all the memory is in offline state. So
* clear all zones' present_pages because they will be updated in
* online_pages() and offline_pages().
+ * TODO: should be in free_area_init_core_hotplug?
*/
reset_node_managed_pages(pgdat);
reset_node_present_pages(pgdat);
@@ -1217,16 +1189,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid)
return pgdat;
}
-static void rollback_node_hotadd(int nid)
-{
- pg_data_t *pgdat = NODE_DATA(nid);
-
- arch_refresh_nodedata(nid, NULL);
- free_percpu(pgdat->per_cpu_nodestats);
- arch_free_nodedata(pgdat);
-}
-
-
/*
* __try_online_node - online a node if offlined
* @nid: the node ID
@@ -1246,7 +1208,7 @@ static int __try_online_node(int nid, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid);
+ pgdat = hotadd_init_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -1327,7 +1289,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* populate a single PMD.
*/
return memmap_on_memory &&
- !hugetlb_free_vmemmap_enabled &&
+ !hugetlb_free_vmemmap_enabled() &&
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
@@ -1421,9 +1383,9 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
BUG_ON(ret);
}
- /* link memory sections under this node.*/
- link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
+ register_memory_blocks_under_node(nid, PFN_DOWN(start),
+ PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1445,9 +1407,6 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
return ret;
error:
- /* rollback pgdat allocation and others */
- if (new_node)
- rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
@@ -1590,38 +1549,6 @@ bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * Confirm all pages in a range [start, end) belong to the same zone (skipping
- * memory holes). When true, return the zone.
- */
-struct zone *test_pages_in_a_zone(unsigned long start_pfn,
- unsigned long end_pfn)
-{
- unsigned long pfn, sec_end_pfn;
- struct zone *zone = NULL;
- struct page *page;
-
- for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
- pfn < end_pfn;
- pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
- /* Make sure the memory section is present first */
- if (!present_section_nr(pfn_to_section_nr(pfn)))
- continue;
- for (; pfn < sec_end_pfn && pfn < end_pfn;
- pfn += MAX_ORDER_NR_PAGES) {
- /* Check if we got outside of the zone */
- if (zone && !zone_spans_pfn(zone, pfn))
- return NULL;
- page = pfn_to_page(pfn);
- if (zone && page_zone(page) != zone)
- return NULL;
- zone = page_zone(page);
- }
- }
-
- return zone;
-}
-
-/*
* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
* non-lru movable pages and hugepages). Will skip over most unmovable
* pages (esp., pages that can be skipped when offlining), but bail out on
@@ -1690,10 +1617,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct folio *folio;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
- head = compound_head(page);
+ folio = page_folio(page);
+ head = &folio->page;
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
@@ -1710,10 +1640,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* the unmap as the catch all safety net).
*/
if (PageHWPoison(page)) {
- if (WARN_ON(PageLRU(page)))
- isolate_lru_page(page);
- if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK);
+ if (WARN_ON(folio_test_lru(folio)))
+ folio_isolate_lru(folio);
+ if (folio_mapped(folio))
+ try_to_unmap(folio, TTU_IGNORE_MLOCK);
continue;
}
@@ -1844,15 +1774,15 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
}
int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
- struct memory_group *group)
+ struct zone *zone, struct memory_group *group)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn, system_ram_pages = 0;
+ const int node = zone_to_nid(zone);
unsigned long flags;
- struct zone *zone;
struct memory_notify arg;
- int ret, node;
char *reason;
+ int ret;
/*
* {on,off}lining is constrained to full memory sections (or more
@@ -1884,15 +1814,17 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
goto failed_removal;
}
- /* This makes hotplug much easier...and readable.
- we assume this for now. .*/
- zone = test_pages_in_a_zone(start_pfn, end_pfn);
- if (!zone) {
+ /*
+ * We only support offlining of memory blocks managed by a single zone,
+ * checked by calling code. This is just a sanity check that we might
+ * want to remove in the future.
+ */
+ if (WARN_ON_ONCE(page_zone(pfn_to_page(start_pfn)) != zone ||
+ page_zone(pfn_to_page(end_pfn - 1)) != zone)) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
- node = zone_to_nid(zone);
/*
* Disable pcplists so that page isolation cannot race with freeing
@@ -2004,6 +1936,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
return 0;
failed_removal_isolated:
+ /* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
@@ -2014,7 +1947,6 @@ failed_removal:
(unsigned long long) start_pfn << PAGE_SHIFT,
((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
reason);
- /* pushback to free area */
mem_hotplug_done();
return ret;
}
@@ -2046,12 +1978,12 @@ static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
return mem->nr_vmemmap_pages;
}
-static int check_cpu_on_node(pg_data_t *pgdat)
+static int check_cpu_on_node(int nid)
{
int cpu;
for_each_present_cpu(cpu) {
- if (cpu_to_node(cpu) == pgdat->node_id)
+ if (cpu_to_node(cpu) == nid)
/*
* the cpu on this node isn't removed, and we can't
* offline this node.
@@ -2085,7 +2017,6 @@ static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
*/
void try_offline_node(int nid)
{
- pg_data_t *pgdat = NODE_DATA(nid);
int rc;
/*
@@ -2093,7 +2024,7 @@ void try_offline_node(int nid)
* offline it. A node spans memory after move_pfn_range_to_zone(),
* e.g., after the memory block was onlined.
*/
- if (pgdat->node_spanned_pages)
+ if (node_spanned_pages(nid))
return;
/*
@@ -2105,7 +2036,7 @@ void try_offline_node(int nid)
if (rc)
return;
- if (check_cpu_on_node(pgdat))
+ if (check_cpu_on_node(nid))
return;
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 028e8dd82b44..a2516d31db6c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -786,7 +786,6 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *next;
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
@@ -801,8 +800,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (start > vma->vm_start)
prev = vma;
- for (; vma && vma->vm_start < end; prev = vma, vma = next) {
- next = vma->vm_next;
+ for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
@@ -814,13 +812,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
- next = vma->vm_next;
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
- /* vma_merge() joined vma && vma->next, case 8 */
goto replace;
}
if (vma->vm_start != vmstart) {
@@ -907,17 +901,14 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p = NULL;
- int err;
+ int ret;
- int locked = 1;
- err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err > 0) {
- err = page_to_nid(p);
+ ret = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &p);
+ if (ret > 0) {
+ ret = page_to_nid(p);
put_page(p);
}
- if (locked)
- mmap_read_unlock(mm);
- return err;
+ return ret;
}
/* Retrieve NUMA policy */
@@ -968,14 +959,14 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/*
- * Take a refcount on the mpol, lookup_node()
- * will drop the mmap_lock, so after calling
- * lookup_node() only "pol" remains valid, "vma"
- * is stale.
+ * Take a refcount on the mpol, because we are about to
+ * drop the mmap_lock, after which only "pol" remains
+ * valid, "vma" is stale.
*/
pol_refcount = pol;
vma = NULL;
mpol_get(pol);
+ mmap_read_unlock(mm);
err = lookup_node(mm, addr);
if (err < 0)
goto out;
diff --git a/mm/memremap.c b/mm/memremap.c
index 6aa5f0c2d11f..af0223605e69 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -4,7 +4,7 @@
#include <linux/io.h>
#include <linux/kasan.h>
#include <linux/memory_hotplug.h>
-#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pfn_t.h>
#include <linux/swap.h>
#include <linux/mmzone.h>
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
+#include "internal.h"
static DEFINE_XARRAY(pgmap_array);
@@ -37,21 +38,19 @@ unsigned long memremap_compat_align(void)
EXPORT_SYMBOL_GPL(memremap_compat_align);
#endif
-#ifdef CONFIG_DEV_PAGEMAP_OPS
+#ifdef CONFIG_FS_DAX
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_dec(&devmap_managed_key);
}
static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
- pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (pgmap->type == MEMORY_DEVICE_FS_DAX)
static_branch_inc(&devmap_managed_key);
}
#else
@@ -61,7 +60,7 @@ static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
{
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+#endif /* CONFIG_FS_DAX */
static void pgmap_array_delete(struct range *range)
{
@@ -102,23 +101,12 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
return (range->start + range_len(range)) >> PAGE_SHIFT;
}
-static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
-{
- if (pfn % (1024 << pgmap->vmemmap_shift))
- cond_resched();
- return pfn + pgmap_vmemmap_nr(pgmap);
-}
-
static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
return (pfn_end(pgmap, range_id) -
pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
-#define for_each_device_pfn(pfn, map, i) \
- for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
- pfn = pfn_next(map, pfn))
-
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
struct range *range = &pgmap->ranges[range_id];
@@ -147,13 +135,11 @@ static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
void memunmap_pages(struct dev_pagemap *pgmap)
{
- unsigned long pfn;
int i;
percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
- for_each_device_pfn(pfn, pgmap, i)
- put_page(pfn_to_page(pfn));
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
percpu_ref_exit(&pgmap->ref);
@@ -282,7 +268,8 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
return 0;
err_add_memory:
- kasan_remove_zero_shadow(__va(range->start), range_len(range));
+ if (!is_private)
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
err_kasan:
untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
err_pfn_remap:
@@ -328,8 +315,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_FS_DAX:
- if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
- IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
@@ -465,21 +451,15 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
}
EXPORT_SYMBOL_GPL(get_dev_pagemap);
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void free_devmap_managed_page(struct page *page)
+void free_zone_device_page(struct page *page)
{
- /* notify page idle for dax */
- if (!is_device_private_page(page)) {
- wake_up_var(&page->_refcount);
+ if (WARN_ON_ONCE(!page->pgmap->ops || !page->pgmap->ops->page_free))
return;
- }
-
- __ClearPageWaiters(page);
mem_cgroup_uncharge(page_folio(page));
/*
- * When a device_private page is freed, the page->mapping field
+ * When a device managed page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as an
* anonymous page. Ultimately, this entire field is just stale
@@ -501,5 +481,27 @@ void free_devmap_managed_page(struct page *page)
*/
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+
+ /*
+ * Reset the page count to 1 to prepare for handing out the page again.
+ */
+ set_page_count(page, 1);
+}
+
+#ifdef CONFIG_FS_DAX
+bool __put_devmap_managed_page(struct page *page)
+{
+ if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+ return false;
+
+ /*
+ * fsdax page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (page_ref_dec_return(page) == 1)
+ wake_up_var(&page->_refcount);
+ return true;
}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
+EXPORT_SYMBOL(__put_devmap_managed_page);
+#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index c7da064b4781..3d60823afd2d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,12 +38,10 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
-#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/balloon_compaction.h>
-#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
@@ -51,10 +49,10 @@
#include <linux/oom.h>
#include <linux/memory.h>
#include <linux/random.h>
+#include <linux/sched/sysctl.h>
#include <asm/tlbflush.h>
-#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
#include "internal.h"
@@ -107,7 +105,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
/* Driver shouldn't use PG_isolated bit of page->flags */
WARN_ON_ONCE(PageIsolated(page));
- __SetPageIsolated(page);
+ SetPageIsolated(page);
unlock_page(page);
return 0;
@@ -126,7 +124,7 @@ static void putback_movable_page(struct page *page)
mapping = page_mapping(page);
mapping->a_ops->putback_page(page);
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
}
/*
@@ -159,7 +157,7 @@ void putback_movable_pages(struct list_head *l)
if (PageMovable(page))
putback_movable_page(page);
else
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
unlock_page(page);
put_page(page);
} else {
@@ -173,37 +171,33 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, void *old)
+static bool remove_migration_pte(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long addr, void *old)
{
- struct page_vma_mapped_walk pvmw = {
- .page = old,
- .vma = vma,
- .address = addr,
- .flags = PVMW_SYNC | PVMW_MIGRATION,
- };
- struct page *new;
- pte_t pte;
- swp_entry_t entry;
+ DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
- VM_BUG_ON_PAGE(PageTail(page), page);
while (page_vma_mapped_walk(&pvmw)) {
- if (PageKsm(page))
- new = page;
- else
- new = page - pvmw.page->index +
- linear_page_index(vma, pvmw.address);
+ pte_t pte;
+ swp_entry_t entry;
+ struct page *new;
+ unsigned long idx = 0;
+
+ /* pgoff is invalid for ksm pages, but they are never large */
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ idx = linear_page_index(vma, pvmw.address) - pvmw.pgoff;
+ new = folio_page(folio, idx);
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
+ VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+ !folio_test_pmd_mappable(folio), folio);
remove_migration_pmd(&pvmw, new);
continue;
}
#endif
- get_page(new);
+ folio_get(folio);
pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);
@@ -232,12 +226,12 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
}
#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(new)) {
+ if (folio_test_hugetlb(folio)) {
unsigned int shift = huge_page_shift(hstate_vma(vma));
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
- if (PageAnon(new))
+ if (folio_test_anon(folio))
hugepage_add_anon_rmap(new, vma, pvmw.address);
else
page_dup_rmap(new, true);
@@ -245,17 +239,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
} else
#endif
{
- if (PageAnon(new))
+ if (folio_test_anon(folio))
page_add_anon_rmap(new, vma, pvmw.address, false);
else
- page_add_file_rmap(new, false);
+ page_add_file_rmap(new, vma, false);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
- if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
- mlock_vma_page(new);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
- if (PageTransHuge(page) && PageMlocked(page))
- clear_page_mlock(page);
+ trace_remove_migration_pte(pvmw.address, pte_val(pte),
+ compound_order(new));
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, pvmw.address, pvmw.pte);
@@ -268,17 +262,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct page *old, struct page *new, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = old,
+ .arg = src,
};
if (locked)
- rmap_walk_locked(new, &rwc);
+ rmap_walk_locked(dst, &rwc);
else
- rmap_walk(new, &rwc);
+ rmap_walk(dst, &rwc);
}
/*
@@ -341,14 +335,8 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
{
int expected_count = 1;
- /*
- * Device private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
if (mapping)
expected_count += compound_nr(page) + page_has_private(page);
-
return expected_count;
}
@@ -771,6 +759,7 @@ int buffer_migrate_page_norefs(struct address_space *mapping,
*/
static int writeout(struct address_space *mapping, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = 1,
@@ -796,7 +785,7 @@ static int writeout(struct address_space *mapping, struct page *page)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page, false);
+ remove_migration_ptes(folio, folio, false);
rc = mapping->a_ops->writepage(page, &wbc);
@@ -883,7 +872,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!PageIsolated(page), page);
if (!PageMovable(page)) {
rc = MIGRATEPAGE_SUCCESS;
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
goto out;
}
@@ -905,7 +894,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* We clear PG_movable under page_lock so any compactor
* cannot try to migrate this page.
*/
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
}
/*
@@ -917,8 +906,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
page->mapping = NULL;
if (likely(!is_zone_device_page(newpage)))
- flush_dcache_page(newpage);
-
+ flush_dcache_folio(page_folio(newpage));
}
out:
return rc;
@@ -927,6 +915,8 @@ out:
static int __unmap_and_move(struct page *page, struct page *newpage,
int force, enum migrate_mode mode)
{
+ struct folio *folio = page_folio(page);
+ struct folio *dst = page_folio(newpage);
int rc = -EAGAIN;
bool page_was_mapped = false;
struct anon_vma *anon_vma = NULL;
@@ -1030,16 +1020,31 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_migrate(page, 0);
+ try_to_migrate(folio, 0);
page_was_mapped = true;
}
if (!page_mapped(page))
rc = move_to_new_page(newpage, page, mode);
+ /*
+ * When successful, push newpage to LRU immediately: so that if it
+ * turns out to be an mlocked page, remove_migration_ptes() will
+ * automatically build up the correct newpage->mlock_count for it.
+ *
+ * We would like to do something similar for the old page, when
+ * unsuccessful, and other cases when a page has been temporarily
+ * isolated from the unevictable LRU: but this case is the easiest.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
+ lru_cache_add(newpage);
+ if (page_was_mapped)
+ lru_add_drain();
+ }
+
if (page_was_mapped)
- remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
+ remove_migration_ptes(folio,
+ rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
out_unlock_both:
unlock_page(newpage);
@@ -1050,20 +1055,12 @@ out_unlock:
unlock_page(page);
out:
/*
- * If migration is successful, decrease refcount of the newpage
+ * If migration is successful, decrease refcount of the newpage,
* which will not free the page because new page owner increased
- * refcounter. As well, if it is LRU page, add the page to LRU
- * list in here. Use the old state of the isolated source page to
- * determine if we migrated a LRU page. newpage was already unlocked
- * and possibly modified by its owner - don't rely on the page
- * state.
+ * refcounter.
*/
- if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(!is_lru))
- put_page(newpage);
- else
- putback_lru_page(newpage);
- }
+ if (rc == MIGRATEPAGE_SUCCESS)
+ put_page(newpage);
return rc;
}
@@ -1092,7 +1089,7 @@ static int unmap_and_move(new_page_t get_new_page,
if (unlikely(__PageMovable(page))) {
lock_page(page);
if (!PageMovable(page))
- __ClearPageIsolated(page);
+ ClearPageIsolated(page);
unlock_page(page);
}
goto out;
@@ -1173,6 +1170,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
enum migrate_mode mode, int reason,
struct list_head *ret)
{
+ struct folio *dst, *src = page_folio(hpage);
int rc = -EAGAIN;
int page_was_mapped = 0;
struct page *new_hpage;
@@ -1200,6 +1198,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
new_hpage = get_new_page(hpage, private);
if (!new_hpage)
return -ENOMEM;
+ dst = page_folio(new_hpage);
if (!trylock_page(hpage)) {
if (!force)
@@ -1249,7 +1248,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
ttu |= TTU_RMAP_LOCKED;
}
- try_to_migrate(hpage, ttu);
+ try_to_migrate(src, ttu);
page_was_mapped = 1;
if (mapping_locked)
@@ -1260,8 +1259,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = move_to_new_page(new_hpage, hpage, mode);
if (page_was_mapped)
- remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+ remove_migration_ptes(src,
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
unlock_put_anon:
unlock_page(new_hpage);
@@ -1351,7 +1350,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
bool is_thp = false;
struct page *page;
struct page *page2;
- int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
LIST_HEAD(ret_pages);
LIST_HEAD(thp_split_pages);
@@ -1360,9 +1358,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
trace_mm_migrate_pages_start(mode, reason);
- if (!swapwrite)
- current->flags |= PF_SWAPWRITE;
-
thp_subpage_migration:
for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
@@ -1517,9 +1512,6 @@ out:
trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
nr_thp_failed, nr_thp_split, mode, reason);
- if (!swapwrite)
- current->flags &= ~PF_SWAPWRITE;
-
if (ret_succeeded)
*ret_succeeded = nr_succeeded;
@@ -1612,7 +1604,6 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
{
struct vm_area_struct *vma;
struct page *page;
- unsigned int follflags;
int err;
mmap_read_lock(mm);
@@ -1622,8 +1613,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
- follflags = FOLL_GET | FOLL_DUMP;
- page = follow_page(vma, addr, follflags);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1762,6 +1752,13 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
}
/*
+ * The move_pages() man page does not have an -EEXIST choice, so
+ * use -EFAULT instead.
+ */
+ if (err == -EEXIST)
+ err = -EFAULT;
+
+ /*
* If the page is already on the target node (!err), store the
* node, otherwise, store the err.
*/
@@ -2034,16 +2031,27 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
int nr_pages = thp_nr_pages(page);
+ int order = compound_order(page);
- VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
/* Do not migrate THP mapped by multiple processes */
if (PageTransHuge(page) && total_mapcount(page) > 1)
return 0;
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, nr_pages))
+ if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
+ int z;
+
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
+ return 0;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ if (populated_zone(pgdat->node_zones + z))
+ break;
+ }
+ wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
return 0;
+ }
if (isolate_lru_page(page))
return 0;
@@ -2072,6 +2080,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
int nr_remaining;
+ unsigned int nr_succeeded;
LIST_HEAD(migratepages);
new_page_t *new;
bool compound;
@@ -2110,7 +2119,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
list_add(&page->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
- MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED,
+ &nr_succeeded);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
@@ -2119,8 +2129,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
putback_lru_page(page);
}
isolated = 0;
- } else
- count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
+ }
+ if (nr_succeeded) {
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+ if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+ mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
+ nr_succeeded);
+ }
BUG_ON(!list_empty(&migratepages));
return isolated;
@@ -2131,761 +2146,6 @@ out:
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA */
-#ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_skip(unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- unsigned long addr;
-
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = 0;
- }
-
- return 0;
-}
-
-static int migrate_vma_collect_hole(unsigned long start,
- unsigned long end,
- __always_unused int depth,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- unsigned long addr;
-
- /* Only allow populating anonymous memory. */
- if (!vma_is_anonymous(walk->vma))
- return migrate_vma_collect_skip(start, end, walk);
-
- for (addr = start; addr < end; addr += PAGE_SIZE) {
- migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
- migrate->dst[migrate->npages] = 0;
- migrate->npages++;
- migrate->cpages++;
- }
-
- return 0;
-}
-
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
-{
- struct migrate_vma *migrate = walk->private;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
- spinlock_t *ptl;
- pte_t *ptep;
-
-again:
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1, walk);
-
- if (pmd_trans_huge(*pmdp)) {
- struct page *page;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
- spin_unlock(ptl);
- goto again;
- }
-
- page = pmd_page(*pmdp);
- if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- if (pmd_trans_unstable(pmdp))
- return migrate_vma_collect_skip(start, end,
- walk);
- } else {
- int ret;
-
- get_page(page);
- spin_unlock(ptl);
- if (unlikely(!trylock_page(page)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
- if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, -1,
- walk);
- }
- }
-
- if (unlikely(pmd_bad(*pmdp)))
- return migrate_vma_collect_skip(start, end, walk);
-
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
- arch_enter_lazy_mmu_mode();
-
- for (; addr < end; addr += PAGE_SIZE, ptep++) {
- unsigned long mpfn = 0, pfn;
- struct page *page;
- swp_entry_t entry;
- pte_t pte;
-
- pte = *ptep;
-
- if (pte_none(pte)) {
- if (vma_is_anonymous(vma)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
- }
- goto next;
- }
-
- if (!pte_present(pte)) {
- /*
- * Only care about unaddressable device page special
- * page table entry. Other special swap entries are not
- * migratable, and we ignore regular swapped page.
- */
- entry = pte_to_swp_entry(pte);
- if (!is_device_private_entry(entry))
- goto next;
-
- page = pfn_swap_entry_to_page(entry);
- if (!(migrate->flags &
- MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
- page->pgmap->owner != migrate->pgmap_owner)
- goto next;
-
- mpfn = migrate_pfn(page_to_pfn(page)) |
- MIGRATE_PFN_MIGRATE;
- if (is_writable_device_private_entry(entry))
- mpfn |= MIGRATE_PFN_WRITE;
- } else {
- if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
- pfn = pte_pfn(pte);
- if (is_zero_pfn(pfn)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
- goto next;
- }
- page = vm_normal_page(migrate->vma, addr, pte);
- mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
- mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
- }
-
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = 0;
- goto next;
- }
-
- /*
- * By getting a reference on the page we pin it and that blocks
- * any kind of migration. Side effect is that it "freezes" the
- * pte.
- *
- * We drop this reference after isolating the page from the lru
- * for non device page (device page are not on the lru and thus
- * can't be dropped from it).
- */
- get_page(page);
-
- /*
- * Optimize for the common case where page is only mapped once
- * in one process. If we can lock the page, then we can safely
- * set up a special migration page table entry now.
- */
- if (trylock_page(page)) {
- pte_t swp_pte;
-
- migrate->cpages++;
- ptep_get_and_clear(mm, addr, ptep);
-
- /* Setup special migration page table entry */
- if (mpfn & MIGRATE_PFN_WRITE)
- entry = make_writable_migration_entry(
- page_to_pfn(page));
- else
- entry = make_readable_migration_entry(
- page_to_pfn(page));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_present(pte)) {
- if (pte_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pte))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- } else {
- if (pte_swp_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pte))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- }
- set_pte_at(mm, addr, ptep, swp_pte);
-
- /*
- * This is like regular unmap: we remove the rmap and
- * drop page refcount. Page won't be freed, as we took
- * a reference just above.
- */
- page_remove_rmap(page, false);
- put_page(page);
-
- if (pte_present(pte))
- unmapped++;
- } else {
- put_page(page);
- mpfn = 0;
- }
-
-next:
- migrate->dst[migrate->npages] = 0;
- migrate->src[migrate->npages++] = mpfn;
- }
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
-
- /* Only flush the TLB if we actually modified any entries */
- if (unmapped)
- flush_tlb_range(walk->vma, start, end);
-
- return 0;
-}
-
-static const struct mm_walk_ops migrate_vma_walk_ops = {
- .pmd_entry = migrate_vma_collect_pmd,
- .pte_hole = migrate_vma_collect_hole,
-};
-
-/*
- * migrate_vma_collect() - collect pages over a range of virtual addresses
- * @migrate: migrate struct containing all migration information
- *
- * This will walk the CPU page table. For each virtual address backed by a
- * valid page, it updates the src array and takes a reference on the page, in
- * order to pin the page until we lock it and unmap it.
- */
-static void migrate_vma_collect(struct migrate_vma *migrate)
-{
- struct mmu_notifier_range range;
-
- /*
- * Note that the pgmap_owner is passed to the mmu notifier callback so
- * that the registered device driver can skip invalidating device
- * private page mappings that won't be migrated.
- */
- mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
- migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
- migrate->pgmap_owner);
- mmu_notifier_invalidate_range_start(&range);
-
- walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
- &migrate_vma_walk_ops, migrate);
-
- mmu_notifier_invalidate_range_end(&range);
- migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
-}
-
-/*
- * migrate_vma_check_page() - check if page is pinned or not
- * @page: struct page to check
- *
- * Pinned pages cannot be migrated. This is the same test as in
- * folio_migrate_mapping(), except that here we allow migration of a
- * ZONE_DEVICE page.
- */
-static bool migrate_vma_check_page(struct page *page)
-{
- /*
- * One extra ref because caller holds an extra reference, either from
- * isolate_lru_page() for a regular page, or migrate_vma_collect() for
- * a device page.
- */
- int extra = 1;
-
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (PageCompound(page))
- return false;
-
- /* Page from ZONE_DEVICE have one extra reference */
- if (is_zone_device_page(page))
- extra++;
-
- /* For file back page */
- if (page_mapping(page))
- extra += 1 + page_has_private(page);
-
- if ((page_count(page) - extra) > page_mapcount(page))
- return false;
-
- return true;
-}
-
-/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
- * special migration pte entry and check if it has been pinned. Pinned pages are
- * restored because we cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
- */
-static void migrate_vma_unmap(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- unsigned long i, restore = 0;
- bool allow_drain = true;
-
- lru_add_drain();
-
- for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page)
- continue;
-
- /* ZONE_DEVICE pages are not on LRU */
- if (!is_zone_device_page(page)) {
- if (!PageLRU(page) && allow_drain) {
- /* Drain CPU's pagevec */
- lru_add_drain_all();
- allow_drain = false;
- }
-
- if (isolate_lru_page(page)) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- continue;
- }
-
- /* Drop the reference we took in collect */
- put_page(page);
- }
-
- if (page_mapped(page))
- try_to_migrate(page, 0);
-
- if (page_mapped(page) || !migrate_vma_check_page(page)) {
- if (!is_zone_device_page(page)) {
- get_page(page);
- putback_lru_page(page);
- }
-
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
- restore++;
- continue;
- }
- }
-
- for (i = 0; i < npages && restore; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
-
- remove_migration_ptes(page, page, false);
-
- migrate->src[i] = 0;
- unlock_page(page);
- put_page(page);
- restore--;
- }
-}
-
-/**
- * migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and pfns arrays for the migration
- *
- * Returns: negative errno on failures, 0 when 0 or more pages were migrated
- * without an error.
- *
- * Prepare to migrate a range of memory virtual address range by collecting all
- * the pages backing each virtual address in the range, saving them inside the
- * src array. Then lock those pages and unmap them. Once the pages are locked
- * and unmapped, check whether each page is pinned or not. Pages that aren't
- * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
- * corresponding src array entry. Then restores any pages that are pinned, by
- * remapping and unlocking those pages.
- *
- * The caller should then allocate destination memory and copy source memory to
- * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
- * flag set). Once these are allocated and copied, the caller must update each
- * corresponding entry in the dst array with the pfn value of the destination
- * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
- * lock_page().
- *
- * Note that the caller does not have to migrate all the pages that are marked
- * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
- * device memory to system memory. If the caller cannot migrate a device page
- * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
- * consequences for the userspace process, so it must be avoided if at all
- * possible.
- *
- * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
- * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
- * allowing the caller to allocate device memory for those unbacked virtual
- * addresses. For this the caller simply has to allocate device memory and
- * properly set the destination entry like for regular migration. Note that
- * this can still fail, and thus inside the device driver you must check if the
- * migration was successful for those entries after calling migrate_vma_pages(),
- * just like for regular migration.
- *
- * After that, the callers must call migrate_vma_pages() to go over each entry
- * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then migrate_vma_pages() to migrate struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the
- * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
- * src array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * Once migrate_vma_pages() returns the caller may inspect which pages were
- * successfully migrated, and which were not. Successfully migrated pages will
- * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
- *
- * It is safe to update device page table after migrate_vma_pages() because
- * both destination and source page are still locked, and the mmap_lock is held
- * in read mode (hence no one can unmap the range being migrated).
- *
- * Once the caller is done cleaning up things and updating its page table (if it
- * chose to do so, this is not an obligation) it finally calls
- * migrate_vma_finalize() to update the CPU page table to point to new pages
- * for successfully migrated pages or otherwise restore the CPU page table to
- * point to the original source pages.
- */
-int migrate_vma_setup(struct migrate_vma *args)
-{
- long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
-
- args->start &= PAGE_MASK;
- args->end &= PAGE_MASK;
- if (!args->vma || is_vm_hugetlb_page(args->vma) ||
- (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
- return -EINVAL;
- if (nr_pages <= 0)
- return -EINVAL;
- if (args->start < args->vma->vm_start ||
- args->start >= args->vma->vm_end)
- return -EINVAL;
- if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
- return -EINVAL;
- if (!args->src || !args->dst)
- return -EINVAL;
-
- memset(args->src, 0, sizeof(*args->src) * nr_pages);
- args->cpages = 0;
- args->npages = 0;
-
- migrate_vma_collect(args);
-
- if (args->cpages)
- migrate_vma_unmap(args);
-
- /*
- * At this point pages are locked and unmapped, and thus they have
- * stable content and can safely be copied to destination memory that
- * is allocated by the drivers.
- */
- return 0;
-
-}
-EXPORT_SYMBOL(migrate_vma_setup);
-
-/*
- * This code closely matches the code in:
- * __handle_mm_fault()
- * handle_pte_fault()
- * do_anonymous_page()
- * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
- */
-static void migrate_vma_insert_page(struct migrate_vma *migrate,
- unsigned long addr,
- struct page *page,
- unsigned long *src)
-{
- struct vm_area_struct *vma = migrate->vma;
- struct mm_struct *mm = vma->vm_mm;
- bool flush = false;
- spinlock_t *ptl;
- pte_t entry;
- pgd_t *pgdp;
- p4d_t *p4dp;
- pud_t *pudp;
- pmd_t *pmdp;
- pte_t *ptep;
-
- /* Only allow populating anonymous memory */
- if (!vma_is_anonymous(vma))
- goto abort;
-
- pgdp = pgd_offset(mm, addr);
- p4dp = p4d_alloc(mm, pgdp, addr);
- if (!p4dp)
- goto abort;
- pudp = pud_alloc(mm, p4dp, addr);
- if (!pudp)
- goto abort;
- pmdp = pmd_alloc(mm, pudp, addr);
- if (!pmdp)
- goto abort;
-
- if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
- goto abort;
-
- /*
- * Use pte_alloc() instead of pte_alloc_map(). We can't run
- * pte_offset_map() on pmds where a huge pmd might be created
- * from a different thread.
- *
- * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
- * parallel threads are excluded by other means.
- *
- * Here we only have mmap_read_lock(mm).
- */
- if (pte_alloc(mm, pmdp))
- goto abort;
-
- /* See the comment in pte_alloc_one_map() */
- if (unlikely(pmd_trans_unstable(pmdp)))
- goto abort;
-
- if (unlikely(anon_vma_prepare(vma)))
- goto abort;
- if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
- goto abort;
-
- /*
- * The memory barrier inside __SetPageUptodate makes sure that
- * preceding stores to the page contents become visible before
- * the set_pte_at() write.
- */
- __SetPageUptodate(page);
-
- if (is_zone_device_page(page)) {
- if (is_device_private_page(page)) {
- swp_entry_t swp_entry;
-
- if (vma->vm_flags & VM_WRITE)
- swp_entry = make_writable_device_private_entry(
- page_to_pfn(page));
- else
- swp_entry = make_readable_device_private_entry(
- page_to_pfn(page));
- entry = swp_entry_to_pte(swp_entry);
- } else {
- /*
- * For now we only support migrating to un-addressable
- * device memory.
- */
- pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
- goto abort;
- }
- } else {
- entry = mk_pte(page, vma->vm_page_prot);
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- }
-
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
-
- if (check_stable_address_space(mm))
- goto unlock_abort;
-
- if (pte_present(*ptep)) {
- unsigned long pfn = pte_pfn(*ptep);
-
- if (!is_zero_pfn(pfn))
- goto unlock_abort;
- flush = true;
- } else if (!pte_none(*ptep))
- goto unlock_abort;
-
- /*
- * Check for userfaultfd but do not deliver the fault. Instead,
- * just back off.
- */
- if (userfaultfd_missing(vma))
- goto unlock_abort;
-
- inc_mm_counter(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr, false);
- if (!is_zone_device_page(page))
- lru_cache_add_inactive_or_unevictable(page, vma);
- get_page(page);
-
- if (flush) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
- set_pte_at_notify(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
- } else {
- /* No need to invalidate - it was non-present before */
- set_pte_at(mm, addr, ptep, entry);
- update_mmu_cache(vma, addr, ptep);
- }
-
- pte_unmap_unlock(ptep, ptl);
- *src = MIGRATE_PFN_MIGRATE;
- return;
-
-unlock_abort:
- pte_unmap_unlock(ptep, ptl);
-abort:
- *src &= ~MIGRATE_PFN_MIGRATE;
-}
-
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- struct mmu_notifier_range range;
- unsigned long addr, i;
- bool notified = false;
-
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
- struct address_space *mapping;
- int r;
-
- if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
-
- if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
- if (!notified) {
- notified = true;
-
- mmu_notifier_range_init_owner(&range,
- MMU_NOTIFY_MIGRATE, 0, migrate->vma,
- migrate->vma->vm_mm, addr, migrate->end,
- migrate->pgmap_owner);
- mmu_notifier_invalidate_range_start(&range);
- }
- migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i]);
- continue;
- }
-
- mapping = page_mapping(page);
-
- if (is_zone_device_page(newpage)) {
- if (is_device_private_page(newpage)) {
- /*
- * For now only support private anonymous when
- * migrating to un-addressable device memory.
- */
- if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
- } else {
- /*
- * Other types of ZONE_DEVICE page are not
- * supported.
- */
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
- }
- }
-
- r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
- if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- }
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
- * did already call it.
- */
- if (notified)
- mmu_notifier_invalidate_range_only_end(&range);
-}
-EXPORT_SYMBOL(migrate_vma_pages);
-
-/**
- * migrate_vma_finalize() - restore CPU page table entry
- * @migrate: migrate struct containing all migration information
- *
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
- *
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
- */
-void migrate_vma_finalize(struct migrate_vma *migrate)
-{
- const unsigned long npages = migrate->npages;
- unsigned long i;
-
- for (i = 0; i < npages; i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
-
- if (!page) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
- }
- continue;
- }
-
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
- }
- newpage = page;
- }
-
- remove_migration_ptes(page, newpage, false);
- unlock_page(page);
-
- if (is_zone_device_page(page))
- put_page(page);
- else
- putback_lru_page(page);
-
- if (newpage != page) {
- unlock_page(newpage);
- if (is_zone_device_page(newpage))
- put_page(newpage);
- else
- putback_lru_page(newpage);
- }
- }
-}
-EXPORT_SYMBOL(migrate_vma_finalize);
-#endif /* CONFIG_DEVICE_PRIVATE */
-
/*
* node_demotion[] example:
*
@@ -3082,18 +2342,21 @@ static int establish_migrate_target(int node, nodemask_t *used,
if (best_distance != -1) {
val = node_distance(node, migration_target);
if (val > best_distance)
- return NUMA_NO_NODE;
+ goto out_clear;
}
index = nd->nr;
if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
"Exceeds maximum demotion target nodes\n"))
- return NUMA_NO_NODE;
+ goto out_clear;
nd->nodes[index] = migration_target;
nd->nr++;
return migration_target;
+out_clear:
+ node_clear(migration_target, *used);
+ return NUMA_NO_NODE;
}
/*
@@ -3190,7 +2453,7 @@ again:
/*
* For callers that do not hold get_online_mems() already.
*/
-static void set_migration_target_nodes(void)
+void set_migration_target_nodes(void)
{
get_online_mems();
__set_migration_target_nodes();
@@ -3254,51 +2517,24 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
-/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
+void __init migrate_on_reclaim_init(void)
{
- set_migration_target_nodes();
- return 0;
-}
-
-static int __init migrate_on_reclaim_init(void)
-{
- int ret;
-
node_demotion = kmalloc_array(nr_node_ids,
sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
- NULL, migration_offline_cpu);
+ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
/*
- * In the unlikely case that this fails, the automatic
- * migration targets may become suboptimal for nodes
- * where N_CPU changes. With such a small impact in a
- * rare case, do not bother trying to do anything special.
+ * At this point, all numa nodes with memory/CPus have their state
+ * properly set, so we can build the demotion order now.
+ * Let us hold the cpu_hotplug lock just, as we could possibily have
+ * CPU hotplug events during boot.
*/
- WARN_ON(ret < 0);
- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
- migration_online_cpu, NULL);
- WARN_ON(ret < 0);
-
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
- return 0;
+ cpus_read_lock();
+ set_migration_target_nodes();
+ cpus_read_unlock();
}
-late_initcall(migrate_on_reclaim_init);
#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
new file mode 100644
index 000000000000..70c7dc05bbfc
--- /dev/null
+++ b/mm/migrate_device.c
@@ -0,0 +1,773 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device Memory Migration functionality.
+ *
+ * Originally written by Jérôme Glisse.
+ */
+#include <linux/export.h>
+#include <linux/memremap.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
+#include <linux/oom.h>
+#include <linux/pagewalk.h>
+#include <linux/rmap.h>
+#include <linux/swapops.h>
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+static int migrate_vma_collect_skip(unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = 0;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_hole(unsigned long start,
+ unsigned long end,
+ __always_unused int depth,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ unsigned long addr;
+
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+ }
+
+ return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+again:
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+ if (pmd_trans_huge(*pmdp)) {
+ struct page *page;
+
+ ptl = pmd_lock(mm, pmdp);
+ if (unlikely(!pmd_trans_huge(*pmdp))) {
+ spin_unlock(ptl);
+ goto again;
+ }
+
+ page = pmd_page(*pmdp);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmdp, addr);
+ if (pmd_trans_unstable(pmdp))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ } else {
+ int ret;
+
+ get_page(page);
+ spin_unlock(ptl);
+ if (unlikely(!trylock_page(page)))
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return migrate_vma_collect_skip(start, end,
+ walk);
+ if (pmd_none(*pmdp))
+ return migrate_vma_collect_hole(start, end, -1,
+ walk);
+ }
+ }
+
+ if (unlikely(pmd_bad(*pmdp)))
+ return migrate_vma_collect_skip(start, end, walk);
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+
+ for (; addr < end; addr += PAGE_SIZE, ptep++) {
+ unsigned long mpfn = 0, pfn;
+ struct page *page;
+ swp_entry_t entry;
+ pte_t pte;
+
+ pte = *ptep;
+
+ if (pte_none(pte)) {
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
+ goto next;
+ }
+
+ if (!pte_present(pte)) {
+ /*
+ * Only care about unaddressable device page special
+ * page table entry. Other special swap entries are not
+ * migratable, and we ignore regular swapped page.
+ */
+ entry = pte_to_swp_entry(pte);
+ if (!is_device_private_entry(entry))
+ goto next;
+
+ page = pfn_swap_entry_to_page(entry);
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
+ if (is_writable_device_private_entry(entry))
+ mpfn |= MIGRATE_PFN_WRITE;
+ } else {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ pfn = pte_pfn(pte);
+ if (is_zero_pfn(pfn)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ goto next;
+ }
+ page = vm_normal_page(migrate->vma, addr, pte);
+ mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+ }
+
+ /* FIXME support THP */
+ if (!page || !page->mapping || PageTransCompound(page)) {
+ mpfn = 0;
+ goto next;
+ }
+
+ /*
+ * By getting a reference on the page we pin it and that blocks
+ * any kind of migration. Side effect is that it "freezes" the
+ * pte.
+ *
+ * We drop this reference after isolating the page from the lru
+ * for non device page (device page are not on the lru and thus
+ * can't be dropped from it).
+ */
+ get_page(page);
+
+ /*
+ * Optimize for the common case where page is only mapped once
+ * in one process. If we can lock the page, then we can safely
+ * set up a special migration page table entry now.
+ */
+ if (trylock_page(page)) {
+ pte_t swp_pte;
+
+ migrate->cpages++;
+ ptep_get_and_clear(mm, addr, ptep);
+
+ /* Setup special migration page table entry */
+ if (mpfn & MIGRATE_PFN_WRITE)
+ entry = make_writable_migration_entry(
+ page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
+ set_pte_at(mm, addr, ptep, swp_pte);
+
+ /*
+ * This is like regular unmap: we remove the rmap and
+ * drop page refcount. Page won't be freed, as we took
+ * a reference just above.
+ */
+ page_remove_rmap(page, vma, false);
+ put_page(page);
+
+ if (pte_present(pte))
+ unmapped++;
+ } else {
+ put_page(page);
+ mpfn = 0;
+ }
+
+next:
+ migrate->dst[migrate->npages] = 0;
+ migrate->src[migrate->npages++] = mpfn;
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
+ /* Only flush the TLB if we actually modified any entries */
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return 0;
+}
+
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+ struct mmu_notifier_range range;
+
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+
+ mmu_notifier_invalidate_range_end(&range);
+ migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * folio_migrate_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+ /*
+ * One extra ref because caller holds an extra reference, either from
+ * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+ * a device page.
+ */
+ int extra = 1;
+
+ /*
+ * FIXME support THP (transparent huge page), it is bit more complex to
+ * check them than regular pages, because they can be mapped with a pmd
+ * or with a pte (split pte mapping).
+ */
+ if (PageCompound(page))
+ return false;
+
+ /* Page from ZONE_DEVICE have one extra reference */
+ if (is_zone_device_page(page))
+ extra++;
+
+ /* For file back page */
+ if (page_mapping(page))
+ extra += 1 + page_has_private(page);
+
+ if ((page_count(page) - extra) > page_mapcount(page))
+ return false;
+
+ return true;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i, restore = 0;
+ bool allow_drain = true;
+
+ lru_add_drain();
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct folio *folio;
+
+ if (!page)
+ continue;
+
+ /* ZONE_DEVICE pages are not on LRU */
+ if (!is_zone_device_page(page)) {
+ if (!PageLRU(page) && allow_drain) {
+ /* Drain CPU's pagevec */
+ lru_add_drain_all();
+ allow_drain = false;
+ }
+
+ if (isolate_lru_page(page)) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ continue;
+ }
+
+ /* Drop the reference we took in collect */
+ put_page(page);
+ }
+
+ folio = page_folio(page);
+ if (folio_mapped(folio))
+ try_to_migrate(folio, 0);
+
+ if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (!is_zone_device_page(page)) {
+ get_page(page);
+ putback_lru_page(page);
+ }
+
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ migrate->cpages--;
+ restore++;
+ continue;
+ }
+ }
+
+ for (i = 0; i < npages && restore; i++) {
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct folio *folio;
+
+ if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
+ folio = page_folio(page);
+ remove_migration_ptes(folio, folio, false);
+
+ migrate->src[i] = 0;
+ folio_unlock(folio);
+ folio_put(folio);
+ restore--;
+ }
+}
+
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of memory virtual address range by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restores any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with MIGRATE_PFN_VALID. Destination pages must be locked via
+ * lock_page().
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing the caller to allocate device memory for those unbacked virtual
+ * addresses. For this the caller simply has to allocate device memory and
+ * properly set the destination entry like for regular migration. Note that
+ * this can still fail, and thus inside the device driver you must check if the
+ * migration was successful for those entries after calling migrate_vma_pages(),
+ * just like for regular migration.
+ *
+ * After that, the callers must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the
+ * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
+ * src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_lock is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
+
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
+static void migrate_vma_insert_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ bool flush = false;
+ spinlock_t *ptl;
+ pte_t entry;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+
+ /* Only allow populating anonymous memory */
+ if (!vma_is_anonymous(vma))
+ goto abort;
+
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ goto abort;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ goto abort;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ goto abort;
+
+ if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp))
+ goto abort;
+
+ /*
+ * Use pte_alloc() instead of pte_alloc_map(). We can't run
+ * pte_offset_map() on pmds where a huge pmd might be created
+ * from a different thread.
+ *
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
+ * parallel threads are excluded by other means.
+ *
+ * Here we only have mmap_read_lock(mm).
+ */
+ if (pte_alloc(mm, pmdp))
+ goto abort;
+
+ /* See the comment in pte_alloc_one_map() */
+ if (unlikely(pmd_trans_unstable(pmdp)))
+ goto abort;
+
+ if (unlikely(anon_vma_prepare(vma)))
+ goto abort;
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
+ goto abort;
+
+ /*
+ * The memory barrier inside __SetPageUptodate makes sure that
+ * preceding stores to the page contents become visible before
+ * the set_pte_at() write.
+ */
+ __SetPageUptodate(page);
+
+ if (is_device_private_page(page)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable device
+ * memory.
+ */
+ if (is_zone_device_page(page)) {
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
+ }
+ entry = mk_pte(page, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pte_mkwrite(pte_mkdirty(entry));
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
+ if (pte_present(*ptep)) {
+ unsigned long pfn = pte_pfn(*ptep);
+
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ inc_mm_counter(mm, MM_ANONPAGES);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ if (!is_zone_device_page(page))
+ lru_cache_add_inactive_or_unevictable(page, vma);
+ get_page(page);
+
+ if (flush) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush_notify(vma, addr, ptep);
+ set_pte_at_notify(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ } else {
+ /* No need to invalidate - it was non-present before */
+ set_pte_at(mm, addr, ptep, entry);
+ update_mmu_cache(vma, addr, ptep);
+ }
+
+ pte_unmap_unlock(ptep, ptl);
+ *src = MIGRATE_PFN_MIGRATE;
+ return;
+
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+abort:
+ *src &= ~MIGRATE_PFN_MIGRATE;
+}
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ const unsigned long start = migrate->start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
+ bool notified = false;
+
+ for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct address_space *mapping;
+ int r;
+
+ if (!newpage) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ if (!page) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+ if (!notified) {
+ notified = true;
+
+ mmu_notifier_range_init_owner(&range,
+ MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+ migrate->vma->vm_mm, addr, migrate->end,
+ migrate->pgmap_owner);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+ migrate_vma_insert_page(migrate, addr, newpage,
+ &migrate->src[i]);
+ continue;
+ }
+
+ mapping = page_mapping(page);
+
+ if (is_device_private_page(newpage)) {
+ /*
+ * For now only support private anonymous when migrating
+ * to un-addressable device memory.
+ */
+ if (mapping) {
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+ } else if (is_zone_device_page(newpage)) {
+ /*
+ * Other types of ZONE_DEVICE page are not supported.
+ */
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ continue;
+ }
+
+ r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ if (r != MIGRATEPAGE_SUCCESS)
+ migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
+ * did already call it.
+ */
+ if (notified)
+ mmu_notifier_invalidate_range_only_end(&range);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ const unsigned long npages = migrate->npages;
+ unsigned long i;
+
+ for (i = 0; i < npages; i++) {
+ struct folio *dst, *src;
+ struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+ struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+ if (!page) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ continue;
+ }
+
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (newpage) {
+ unlock_page(newpage);
+ put_page(newpage);
+ }
+ newpage = page;
+ }
+
+ src = page_folio(page);
+ dst = page_folio(newpage);
+ remove_migration_ptes(src, dst, false);
+ folio_unlock(src);
+
+ if (is_zone_device_page(page))
+ put_page(page);
+ else
+ putback_lru_page(page);
+
+ if (newpage != page) {
+ unlock_page(newpage);
+ if (is_zone_device_page(newpage))
+ put_page(newpage);
+ else
+ putback_lru_page(newpage);
+ }
+ }
+}
+EXPORT_SYMBOL(migrate_vma_finalize);
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f584eddd305..529fbc1f27c8 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,6 +14,7 @@
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
+#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
@@ -27,6 +28,8 @@
#include "internal.h"
+static DEFINE_PER_CPU(struct pagevec, mlock_pvec);
+
bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
@@ -46,441 +49,320 @@ EXPORT_SYMBOL(can_do_mlock);
* be placed on the LRU "unevictable" list, rather than the [in]active lists.
* The unevictable list is an LRU sibling list to the [in]active lists.
* PageUnevictable is set to indicate the unevictable state.
- *
- * When lazy mlocking via vmscan, it is important to ensure that the
- * vma's VM_LOCKED status is not concurrently being modified, otherwise we
- * may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_lock for read, and verify that the vma really is locked
- * (see mm/rmap.c).
*/
-/*
- * LRU accounting for clear_page_mlock()
- */
-void clear_page_mlock(struct page *page)
+static struct lruvec *__mlock_page(struct page *page, struct lruvec *lruvec)
{
- int nr_pages;
+ /* There is nothing more we can do while it's off LRU */
+ if (!TestClearPageLRU(page))
+ return lruvec;
- if (!TestClearPageMlocked(page))
- return;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
- nr_pages = thp_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
- /*
- * The previous TestClearPageMlocked() corresponds to the smp_mb()
- * in __pagevec_lru_add_fn().
- *
- * See __pagevec_lru_add_fn for more explanation.
- */
- if (!isolate_lru_page(page)) {
- putback_lru_page(page);
- } else {
+ if (unlikely(page_evictable(page))) {
/*
- * We lost the race. the page already moved to evictable list.
+ * This is a little surprising, but quite possible:
+ * PageMlocked must have got cleared already by another CPU.
+ * Could this page be on the Unevictable LRU? I'm not sure,
+ * but move it now if so.
*/
- if (PageUnevictable(page))
- count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ if (PageUnevictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED,
+ thp_nr_pages(page));
+ }
+ goto out;
}
+
+ if (PageUnevictable(page)) {
+ if (PageMlocked(page))
+ page->mlock_count++;
+ goto out;
+ }
+
+ del_page_from_lru_list(page, lruvec);
+ ClearPageActive(page);
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ SetPageLRU(page);
+ return lruvec;
}
-/*
- * Mark page as mlocked if not already.
- * If page on LRU, isolate and putback to move to unevictable list.
- */
-void mlock_vma_page(struct page *page)
+static struct lruvec *__mlock_new_page(struct page *page, struct lruvec *lruvec)
{
- /* Serialize with page migration */
- BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
- if (!TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
+ /* As above, this is a little surprising, but possible */
+ if (unlikely(page_evictable(page)))
+ goto out;
- mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- if (!isolate_lru_page(page))
- putback_lru_page(page);
- }
+ SetPageUnevictable(page);
+ page->mlock_count = !!PageMlocked(page);
+ __count_vm_events(UNEVICTABLE_PGCULLED, thp_nr_pages(page));
+out:
+ add_page_to_lru_list(page, lruvec);
+ SetPageLRU(page);
+ return lruvec;
}
-/*
- * Finish munlock after successful page isolation
- *
- * Page must be locked. This is a wrapper for page_mlock()
- * and putback_lru_page() with munlock accounting.
- */
-static void __munlock_isolated_page(struct page *page)
+static struct lruvec *__munlock_page(struct page *page, struct lruvec *lruvec)
{
- /*
- * Optimization: if the page was mapped just once, that's our mapping
- * and we don't need to check all the other vmas.
- */
- if (page_mapcount(page) > 1)
- page_mlock(page);
+ int nr_pages = thp_nr_pages(page);
+ bool isolated = false;
+
+ if (!TestClearPageLRU(page))
+ goto munlock;
- /* Did try_to_unlock() succeed or punt? */
- if (!PageMlocked(page))
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
+ isolated = true;
+ lruvec = folio_lruvec_relock_irq(page_folio(page), lruvec);
- putback_lru_page(page);
+ if (PageUnevictable(page)) {
+ /* Then mlock_count is maintained, but might undercount */
+ if (page->mlock_count)
+ page->mlock_count--;
+ if (page->mlock_count)
+ goto out;
+ }
+ /* else assume that was the last mlock: reclaim will fix it if not */
+
+munlock:
+ if (TestClearPageMlocked(page)) {
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (isolated || !PageUnevictable(page))
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+ else
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
+ }
+
+ /* page_evictable() has to be checked *after* clearing Mlocked */
+ if (isolated && PageUnevictable(page) && page_evictable(page)) {
+ del_page_from_lru_list(page, lruvec);
+ ClearPageUnevictable(page);
+ add_page_to_lru_list(page, lruvec);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ }
+out:
+ if (isolated)
+ SetPageLRU(page);
+ return lruvec;
}
/*
- * Accounting for page isolation fail during munlock
- *
- * Performs accounting when page isolation fails in munlock. There is nothing
- * else to do because it means some other task has already removed the page
- * from the LRU. putback_lru_page() will take care of removing the page from
- * the unevictable list, if necessary. vmscan [page_referenced()] will move
- * the page back to the unevictable list if some other vma has it mlocked.
+ * Flags held in the low bits of a struct page pointer on the mlock_pvec.
*/
-static void __munlock_isolation_failed(struct page *page)
+#define LRU_PAGE 0x1
+#define NEW_PAGE 0x2
+static inline struct page *mlock_lru(struct page *page)
{
- int nr_pages = thp_nr_pages(page);
+ return (struct page *)((unsigned long)page + LRU_PAGE);
+}
- if (PageUnevictable(page))
- __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
- else
- __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
+static inline struct page *mlock_new(struct page *page)
+{
+ return (struct page *)((unsigned long)page + NEW_PAGE);
}
-/**
- * munlock_vma_page - munlock a vma page
- * @page: page to be unlocked, either a normal page or THP page head
- *
- * returns the size of the page as a page mask (0 for normal page,
- * HPAGE_PMD_NR - 1 for THP head page)
- *
- * called from munlock()/munmap() path with page supposedly on the LRU.
- * When we munlock a page, because the vma where we found the page is being
- * munlock()ed or munmap()ed, we want to check whether other vmas hold the
- * page locked so that we can leave it on the unevictable lru list and not
- * bother vmscan with it. However, to walk the page's rmap list in
- * page_mlock() we must isolate the page from the LRU. If some other
- * task has removed the page from the LRU, we won't be able to do that.
- * So we clear the PageMlocked as we might not get another chance. If we
- * can't isolate the page, we leave it for putback_lru_page() and vmscan
- * [page_referenced()/try_to_unmap()] to deal with.
+/*
+ * mlock_pagevec() is derived from pagevec_lru_move_fn():
+ * perhaps that can make use of such page pointer flags in future,
+ * but for now just keep it for mlock. We could use three separate
+ * pagevecs instead, but one feels better (munlocking a full pagevec
+ * does not need to drain mlocking pagevecs first).
*/
-unsigned int munlock_vma_page(struct page *page)
+static void mlock_pagevec(struct pagevec *pvec)
{
- int nr_pages;
-
- /* For page_mlock() and to serialize with page migration */
- BUG_ON(!PageLocked(page));
- VM_BUG_ON_PAGE(PageTail(page), page);
+ struct lruvec *lruvec = NULL;
+ unsigned long mlock;
+ struct page *page;
+ int i;
- if (!TestClearPageMlocked(page)) {
- /* Potentially, PTE-mapped THP: do not skip the rest PTEs */
- return 0;
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ page = pvec->pages[i];
+ mlock = (unsigned long)page & (LRU_PAGE | NEW_PAGE);
+ page = (struct page *)((unsigned long)page - mlock);
+ pvec->pages[i] = page;
+
+ if (mlock & LRU_PAGE)
+ lruvec = __mlock_page(page, lruvec);
+ else if (mlock & NEW_PAGE)
+ lruvec = __mlock_new_page(page, lruvec);
+ else
+ lruvec = __munlock_page(page, lruvec);
}
- nr_pages = thp_nr_pages(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ if (lruvec)
+ unlock_page_lruvec_irq(lruvec);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
+}
- if (!isolate_lru_page(page))
- __munlock_isolated_page(page);
- else
- __munlock_isolation_failed(page);
+void mlock_page_drain(int cpu)
+{
+ struct pagevec *pvec;
- return nr_pages - 1;
+ pvec = &per_cpu(mlock_pvec, cpu);
+ if (pagevec_count(pvec))
+ mlock_pagevec(pvec);
}
-/*
- * convert get_user_pages() return value to posix mlock() error
- */
-static int __mlock_posix_error_return(long retval)
+bool need_mlock_page_drain(int cpu)
{
- if (retval == -EFAULT)
- retval = -ENOMEM;
- else if (retval == -ENOMEM)
- retval = -EAGAIN;
- return retval;
+ return pagevec_count(&per_cpu(mlock_pvec, cpu));
}
-/*
- * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec()
- *
- * The fast path is available only for evictable pages with single mapping.
- * Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need page_mlock() which can fail.
- * when !page_evictable(), we need the full redo logic of putback_lru_page to
- * avoid leaving evictable page in unevictable list.
- *
- * In case of success, @page is added to @pvec and @pgrescued is incremented
- * in case that the page was previously unevictable. @page is also unlocked.
+/**
+ * mlock_folio - mlock a folio already on (or temporarily off) LRU
+ * @folio: folio to be mlocked.
*/
-static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
- int *pgrescued)
+void mlock_folio(struct folio *folio)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
- if (page_mapcount(page) <= 1 && page_evictable(page)) {
- pagevec_add(pvec, page);
- if (TestClearPageUnevictable(page))
- (*pgrescued)++;
- unlock_page(page);
- return true;
+ if (!folio_test_set_mlocked(folio)) {
+ int nr_pages = folio_nr_pages(folio);
+
+ zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
- return false;
+ folio_get(folio);
+ if (!pagevec_add(pvec, mlock_lru(&folio->page)) ||
+ folio_test_large(folio) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Putback multiple evictable pages to the LRU
- *
- * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
- * the pages might have meanwhile become unevictable but that is OK.
+/**
+ * mlock_new_page - mlock a newly allocated page not yet on LRU
+ * @page: page to be mlocked, either a normal page or a THP head.
*/
-static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
+void mlock_new_page(struct page *page)
{
- count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
- /*
- *__pagevec_lru_add() calls release_pages() so we don't call
- * put_page() explicitly
- */
- __pagevec_lru_add(pvec);
- count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
+ int nr_pages = thp_nr_pages(page);
+
+ SetPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
+
+ get_page(page);
+ if (!pagevec_add(pvec, mlock_new(page)) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Munlock a batch of pages from the same zone
- *
- * The work is split to two main phases. First phase clears the Mlocked flag
- * and attempts to isolate the pages, all under a single zone lru lock.
- * The second phase finishes the munlock only for pages where isolation
- * succeeded.
- *
- * Note that the pagevec may be modified during the process.
+/**
+ * munlock_page - munlock a page
+ * @page: page to be munlocked, either a normal page or a THP head.
*/
-static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+void munlock_page(struct page *page)
{
- int i;
- int nr = pagevec_count(pvec);
- int delta_munlocked = -nr;
- struct pagevec pvec_putback;
- struct lruvec *lruvec = NULL;
- int pgrescued = 0;
-
- pagevec_init(&pvec_putback);
-
- /* Phase 1: page isolation */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
- struct folio *folio = page_folio(page);
-
- if (TestClearPageMlocked(page)) {
- /*
- * We already have pin from follow_page_mask()
- * so we can spare the get_page() here.
- */
- if (TestClearPageLRU(page)) {
- lruvec = folio_lruvec_relock_irq(folio, lruvec);
- del_page_from_lru_list(page, lruvec);
- continue;
- } else
- __munlock_isolation_failed(page);
- } else {
- delta_munlocked++;
- }
-
- /*
- * We won't be munlocking this page in the next phase
- * but we still need to release the follow_page_mask()
- * pin. We cannot do it under lru_lock however. If it's
- * the last pin, __page_cache_release() would deadlock.
- */
- pagevec_add(&pvec_putback, pvec->pages[i]);
- pvec->pages[i] = NULL;
- }
- if (lruvec) {
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- unlock_page_lruvec_irq(lruvec);
- } else if (delta_munlocked) {
- mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- }
-
- /* Now we can release pins of pages that we are not munlocking */
- pagevec_release(&pvec_putback);
-
- /* Phase 2: page munlock */
- for (i = 0; i < nr; i++) {
- struct page *page = pvec->pages[i];
-
- if (page) {
- lock_page(page);
- if (!__putback_lru_fast_prepare(page, &pvec_putback,
- &pgrescued)) {
- /*
- * Slow path. We don't want to lose the last
- * pin before unlock_page()
- */
- get_page(page); /* for putback_lru_page() */
- __munlock_isolated_page(page);
- unlock_page(page);
- put_page(page); /* from follow_page_mask() */
- }
- }
- }
+ struct pagevec *pvec = &get_cpu_var(mlock_pvec);
/*
- * Phase 3: page putback for pages that qualified for the fast path
- * This will also call put_page() to return pin from follow_page_mask()
+ * TestClearPageMlocked(page) must be left to __munlock_page(),
+ * which will check whether the page is multiply mlocked.
*/
- if (pagevec_count(&pvec_putback))
- __putback_lru_fast(&pvec_putback, pgrescued);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) ||
+ PageHead(page) || lru_cache_disabled())
+ mlock_pagevec(pvec);
+ put_cpu_var(mlock_pvec);
}
-/*
- * Fill up pagevec for __munlock_pagevec using pte walk
- *
- * The function expects that the struct page corresponding to @start address is
- * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
- *
- * The rest of @pvec is filled by subsequent pages within the same pmd and same
- * zone, as long as the pte's are present and vm_normal_page() succeeds. These
- * pages also get pinned.
- *
- * Returns the address of the next page that should be scanned. This equals
- * @start + PAGE_SIZE when no page could be added by the pte walk.
- */
-static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- struct vm_area_struct *vma, struct zone *zone,
- unsigned long start, unsigned long end)
+static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
{
- pte_t *pte;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
+ pte_t *start_pte, *pte;
+ struct page *page;
- /*
- * Initialize pte walk starting at the already pinned page where we
- * are sure that there is a pte, as it was pinned under the same
- * mmap_lock write op.
- */
- pte = get_locked_pte(vma->vm_mm, start, &ptl);
- /* Make sure we do not cross the page table boundary */
- end = pgd_addr_end(start, end);
- end = p4d_addr_end(start, end);
- end = pud_addr_end(start, end);
- end = pmd_addr_end(start, end);
-
- /* The page next to the pinned page is the first we will try to get */
- start += PAGE_SIZE;
- while (start < end) {
- struct page *page = NULL;
- pte++;
- if (pte_present(*pte))
- page = vm_normal_page(vma, start, *pte);
- /*
- * Break if page could not be obtained or the page's node+zone does not
- * match
- */
- if (!page || page_zone(page) != zone)
- break;
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
+ if (!pmd_present(*pmd))
+ goto out;
+ if (is_huge_zero_pmd(*pmd))
+ goto out;
+ page = pmd_page(*pmd);
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_folio(page_folio(page));
+ else
+ munlock_page(page);
+ goto out;
+ }
- /*
- * Do not use pagevec for PTE-mapped THP,
- * munlock_vma_pages_range() will handle them.
- */
+ start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
+ if (!pte_present(*pte))
+ continue;
+ page = vm_normal_page(vma, addr, *pte);
+ if (!page)
+ continue;
if (PageTransCompound(page))
- break;
-
- get_page(page);
- /*
- * Increase the address that will be returned *before* the
- * eventual break due to pvec becoming full by adding the page
- */
- start += PAGE_SIZE;
- if (pagevec_add(pvec, page) == 0)
- break;
+ continue;
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_folio(page_folio(page));
+ else
+ munlock_page(page);
}
- pte_unmap_unlock(pte, ptl);
- return start;
+ pte_unmap(start_pte);
+out:
+ spin_unlock(ptl);
+ cond_resched();
+ return 0;
}
/*
- * munlock_vma_pages_range() - munlock all pages in the vma range.'
- * @vma - vma containing range to be munlock()ed.
+ * mlock_vma_pages_range() - mlock any pages already in the range,
+ * or munlock all pages in the range.
+ * @vma - vma containing range to be mlock()ed or munlock()ed
* @start - start address in @vma of the range
- * @end - end of range in @vma.
- *
- * For mremap(), munmap() and exit().
+ * @end - end of range in @vma
+ * @newflags - the new set of flags for @vma.
*
- * Called with @vma VM_LOCKED.
- *
- * Returns with VM_LOCKED cleared. Callers must be prepared to
- * deal with this.
- *
- * We don't save and restore VM_LOCKED here because pages are
- * still on lru. In unmap path, pages might be scanned by reclaim
- * and re-mlocked by page_mlock/try_to_unmap before we unmap and
- * free them. This will result in freeing mlocked pages.
+ * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
+ * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
*/
-void munlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static void mlock_vma_pages_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, vm_flags_t newflags)
{
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ static const struct mm_walk_ops mlock_walk_ops = {
+ .pmd_entry = mlock_pte_range,
+ };
- while (start < end) {
- struct page *page;
- unsigned int page_mask = 0;
- unsigned long page_increm;
- struct pagevec pvec;
- struct zone *zone;
+ /*
+ * There is a slight chance that concurrent page migration,
+ * or page reclaim finding a page of this now-VM_LOCKED vma,
+ * will call mlock_vma_page() and raise page's mlock_count:
+ * double counting, leaving the page unevictable indefinitely.
+ * Communicate this danger to mlock_vma_page() with VM_IO,
+ * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
+ * mmap_lock is held in write mode here, so this weird
+ * combination should not be visible to other mmap_lock users;
+ * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
+ */
+ if (newflags & VM_LOCKED)
+ newflags |= VM_IO;
+ WRITE_ONCE(vma->vm_flags, newflags);
- pagevec_init(&pvec);
- /*
- * Although FOLL_DUMP is intended for get_dump_page(),
- * it just so happens that its special treatment of the
- * ZERO_PAGE (returning an error instead of doing get_page)
- * suits munlock very well (and if somehow an abnormal page
- * has sneaked into the range, we won't oops here: great).
- */
- page = follow_page(vma, start, FOLL_GET | FOLL_DUMP);
-
- if (page && !IS_ERR(page)) {
- if (PageTransTail(page)) {
- VM_BUG_ON_PAGE(PageMlocked(page), page);
- put_page(page); /* follow_page_mask() */
- } else if (PageTransHuge(page)) {
- lock_page(page);
- /*
- * Any THP page found by follow_page_mask() may
- * have gotten split before reaching
- * munlock_vma_page(), so we need to compute
- * the page_mask here instead.
- */
- page_mask = munlock_vma_page(page);
- unlock_page(page);
- put_page(page); /* follow_page_mask() */
- } else {
- /*
- * Non-huge pages are handled in batches via
- * pagevec. The pin from follow_page_mask()
- * prevents them from collapsing by THP.
- */
- pagevec_add(&pvec, page);
- zone = page_zone(page);
-
- /*
- * Try to fill the rest of pagevec using fast
- * pte walk. This will also update start to
- * the next page to process. Then munlock the
- * pagevec.
- */
- start = __munlock_pagevec_fill(&pvec, vma,
- zone, start, end);
- __munlock_pagevec(&pvec, zone);
- goto next;
- }
- }
- page_increm = 1 + page_mask;
- start += page_increm * PAGE_SIZE;
-next:
- cond_resched();
+ lru_add_drain();
+ walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
+ lru_add_drain();
+
+ if (newflags & VM_IO) {
+ newflags &= ~VM_IO;
+ WRITE_ONCE(vma->vm_flags, newflags);
}
}
@@ -500,10 +382,9 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff_t pgoff;
int nr_pages;
int ret = 0;
- int lock = !!(newflags & VM_LOCKED);
- vm_flags_t old_flags = vma->vm_flags;
+ vm_flags_t oldflags = vma->vm_flags;
- if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
vma_is_dax(vma) || vma_is_secretmem(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
@@ -512,7 +393,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
@@ -535,9 +416,9 @@ success:
* Keep track of amount of locked VM.
*/
nr_pages = (end - start) >> PAGE_SHIFT;
- if (!lock)
+ if (!(newflags & VM_LOCKED))
nr_pages = -nr_pages;
- else if (old_flags & VM_LOCKED)
+ else if (oldflags & VM_LOCKED)
nr_pages = 0;
mm->locked_vm += nr_pages;
@@ -547,11 +428,12 @@ success:
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
- if (lock)
+ if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
+ /* No work to do, and mlocking twice would be wrong */
vma->vm_flags = newflags;
- else
- munlock_vma_pages_range(vma, start, end);
-
+ } else {
+ mlock_vma_pages_range(vma, start, end, newflags);
+ }
out:
*prev = vma;
return ret;
@@ -645,6 +527,18 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
return count >> PAGE_SHIFT;
}
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+ if (retval == -EFAULT)
+ retval = -ENOMEM;
+ else if (retval == -ENOMEM)
+ retval = -EAGAIN;
+ return retval;
+}
+
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
unsigned long locked;
@@ -827,18 +721,18 @@ int user_shm_lock(size_t size, struct ucounts *ucounts)
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
lock_limit = rlimit(RLIMIT_MEMLOCK);
- if (lock_limit == RLIM_INFINITY)
- allowed = 1;
- lock_limit >>= PAGE_SHIFT;
+ if (lock_limit != RLIM_INFINITY)
+ lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock);
memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
- if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ if ((memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
}
if (!get_ucounts(ucounts)) {
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+ allowed = 0;
goto out;
}
allowed = 1;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51ed..3aa839f81e63 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1031,7 +1031,7 @@ again:
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
- if (!is_same_vma_anon_name(vma, anon_name))
+ if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
return 0;
return 1;
}
@@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1616,8 +1616,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
@@ -2557,7 +2555,7 @@ static int __init cmdline_parse_stack_guard_gap(char *p)
if (!*endptr)
stack_guard_gap = val << PAGE_SHIFT;
- return 0;
+ return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
@@ -2674,6 +2672,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma->vm_prev = NULL;
do {
vma_rb_erase(vma, &mm->mm_rb);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= vma_pages(vma);
mm->map_count--;
tail_vma = vma;
vma = vma->vm_next;
@@ -2778,22 +2778,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-static inline void
-unlock_range(struct vm_area_struct *start, unsigned long limit)
-{
- struct mm_struct *mm = start->vm_mm;
- struct vm_area_struct *tmp = start;
-
- while (tmp && tmp->vm_start < limit) {
- if (tmp->vm_flags & VM_LOCKED) {
- mm->locked_vm -= vma_pages(tmp);
- munlock_vma_pages_all(tmp);
- }
-
- tmp = tmp->vm_next;
- }
-}
-
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
@@ -2874,12 +2858,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
return error;
}
- /*
- * unlock any mlock()ed ranges before detaching vmas
- */
- if (mm->locked_vm)
- unlock_range(vma, end);
-
/* Detach vmas from rbtree */
if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
downgrade = false;
@@ -3147,20 +3125,12 @@ void exit_mmap(struct mm_struct *mm)
* Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
- *
- * This needs to be done before calling unlock_range(),
- * which clears VM_LOCKED, otherwise the oom reaper cannot
- * reliably test it.
*/
(void)__oom_reap_task_mm(mm);
-
set_bit(MMF_OOM_SKIP, &mm->flags);
}
mmap_write_lock(mm);
- if (mm->locked_vm)
- unlock_range(mm->mmap, ULONG_MAX);
-
arch_exit_mmap(mm);
vma = mm->mmap;
@@ -3186,6 +3156,7 @@ void exit_mmap(struct mm_struct *mm)
vma = remove_vma(vma);
cond_resched();
}
+ mm->mmap = NULL;
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3255,7 +3226,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@@ -3447,6 +3418,7 @@ static struct vm_area_struct *__install_special_mapping(
vma->vm_end = addr + len;
vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = ops;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb89d6e018e2..0ae7571e35ab 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -81,6 +81,13 @@ void lruvec_init(struct lruvec *lruvec)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
+ /*
+ * The "Unevictable LRU" is imaginary: though its size is maintained,
+ * it is never scanned, and unevictable pages are not threaded on it
+ * (so that their lru fields can be reused to hold mlock_count).
+ * Poison its list head, so that any operations on it would crash.
+ */
+ list_del(&lruvec->lists[LRU_UNEVICTABLE]);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
@@ -89,13 +96,14 @@ int page_cpupid_xchg_last(struct page *page, int cpupid)
unsigned long old_flags, flags;
int last_cpupid;
+ old_flags = READ_ONCE(page->flags);
do {
- old_flags = flags = page->flags;
- last_cpupid = page_cpupid_last(page);
+ flags = old_flags;
+ last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK;
flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
- } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
+ } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
return last_cpupid;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0138dfcdb1d8..b69ce7a7b2b7 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,7 @@
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
+#include <linux/sched/sysctl.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -83,6 +84,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (prot_numa) {
struct page *page;
+ int nid;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
@@ -94,7 +96,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
- page_mapcount(page) != 1)
+ page_count(page) != 1)
continue;
/*
@@ -109,7 +111,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* Don't mess with PTEs if page is already on the node
* a single-threaded process is running on.
*/
- if (target_node == page_to_nid(page))
+ nid = page_to_nid(page);
+ if (target_node == nid)
+ continue;
+
+ /*
+ * Skip scanning top tier node if normal numa
+ * balancing is disabled
+ */
+ if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
+ node_is_toptier(nid))
continue;
}
@@ -464,7 +475,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
diff --git a/mm/mremap.c b/mm/mremap.c
index 002eec83e91e..9d76da79594d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -942,8 +942,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (mmap_write_lock_killable(current->mm))
return -EINTR;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr) {
+ vma = vma_lookup(mm, addr);
+ if (!vma) {
ret = EFAULT;
goto out;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 832fb330376e..7ec38194f8e1 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -93,9 +93,6 @@ static bool oom_cpuset_eligible(struct task_struct *start,
bool ret = false;
const nodemask_t *mask = oc->nodemask;
- if (is_memcg_oom(oc))
- return true;
-
rcu_read_lock();
for_each_thread(start, tsk) {
if (mask) {
@@ -526,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_lru_vma(vma))
+ if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d163f8d36b..7e2da284e427 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -324,18 +324,6 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
}
/*
- * Unreclaimable memory (kernel memory or anonymous memory
- * without swap) can bring down the dirtyable pages below
- * the zone's dirty balance reserve and the above calculation
- * will underflow. However we still want to add in nodes
- * which are below threshold (negative values) to get a more
- * accurate calculation but make sure that the total never
- * underflows.
- */
- if ((long)x < 0)
- x = 0;
-
- /*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
* occur in very strange VM situations but we want to make sure
@@ -2430,13 +2418,13 @@ EXPORT_SYMBOL(folio_write_one);
/*
* For address_spaces which do not use buffers nor write back.
*/
-int __set_page_dirty_no_writeback(struct page *page)
+bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- if (!PageDirty(page))
- return !TestSetPageDirty(page);
- return 0;
+ if (!folio_test_dirty(folio))
+ return !folio_test_set_dirty(folio);
+ return false;
}
-EXPORT_SYMBOL(__set_page_dirty_no_writeback);
+EXPORT_SYMBOL(noop_dirty_folio);
/*
* Helper function for set_page_dirty family.
@@ -2477,16 +2465,14 @@ static void folio_account_dirtied(struct folio *folio,
*
* Caller must hold lock_page_memcg().
*/
-void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
- struct bdi_writeback *wb)
+void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
- if (mapping_can_writeback(mapping)) {
- long nr = folio_nr_pages(folio);
- lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
- zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
- wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
- task_io_account_cancelled_write(nr * PAGE_SIZE);
- }
+ long nr = folio_nr_pages(folio);
+
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ task_io_account_cancelled_write(nr * PAGE_SIZE);
}
/*
@@ -2530,7 +2516,7 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
* This is also sometimes used by filesystems which use buffer_heads when
* a single buffer is being dirtied: we want to set the folio dirty in
* that case, but not all the buffers. This is a "bottom-up" dirtying,
- * whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ * whereas block_dirty_folio() is a "top-down" dirtying.
*
* The caller must ensure this doesn't race with truncation. Most will
* simply hold the folio lock, but e.g. zap_pte_range() calls with the
@@ -2616,7 +2602,7 @@ EXPORT_SYMBOL(folio_redirty_for_writepage);
* folio_mark_dirty - Mark a folio as being modified.
* @folio: The folio.
*
- * For folios with a mapping this should be done under the page lock
+ * For folios with a mapping this should be done with the folio lock held
* for the benefit of asynchronous memory errors who prefer a consistent
* dirty state. This rule can be broken in some special cases,
* but should be better not to.
@@ -2630,23 +2616,21 @@ bool folio_mark_dirty(struct folio *folio)
if (likely(mapping)) {
/*
* readahead/lru_deactivate_page could remain
- * PG_readahead/PG_reclaim due to race with end_page_writeback
- * About readahead, if the page is written, the flags would be
+ * PG_readahead/PG_reclaim due to race with folio_end_writeback
+ * About readahead, if the folio is written, the flags would be
* reset. So no problem.
- * About lru_deactivate_page, if the page is redirty, the flag
- * will be reset. So no problem. but if the page is used by readahead
- * it will confuse readahead and make it restart the size rampup
- * process. But it's a trivial problem.
+ * About lru_deactivate_page, if the folio is redirtied,
+ * the flag will be reset. So no problem. but if the
+ * folio is used by readahead it will confuse readahead
+ * and make it restart the size rampup process. But it's
+ * a trivial problem.
*/
if (folio_test_reclaim(folio))
folio_clear_reclaim(folio);
- return mapping->a_ops->set_page_dirty(&folio->page);
+ return mapping->a_ops->dirty_folio(mapping, folio);
}
- if (!folio_test_dirty(folio)) {
- if (!folio_test_set_dirty(folio))
- return true;
- }
- return false;
+
+ return noop_dirty_folio(mapping, folio);
}
EXPORT_SYMBOL(folio_mark_dirty);
@@ -2697,7 +2681,7 @@ void __folio_cancel_dirty(struct folio *folio)
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (folio_test_clear_dirty(folio))
- folio_account_cleaned(folio, mapping, wb);
+ folio_account_cleaned(folio, wb);
unlocked_inode_to_wb_end(inode, &cookie);
folio_memcg_unlock(folio);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31..bdc8f60ae462 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -128,7 +128,7 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
struct pagesets {
local_lock_t lock;
};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -378,25 +378,9 @@ int page_group_by_mobility_disabled __read_mostly;
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-/*
- * Calling kasan_poison_pages() only after deferred memory initialization
- * has completed. Poisoning pages during deferred memory init will greatly
- * lengthen the process and cause problem in large memory systems as the
- * deferred pages initialization is done with interrupt disabled.
- *
- * Assuming that there will be no reference to those newly initialized
- * pages before they are ever allocated, this should have no effect on
- * KASAN memory tracking as the poison will be properly inserted at page
- * allocation time. The only corner case is when pages are allocated by
- * on-demand allocation and then freed again before the deferred pages
- * initialization is done, but this is not likely to happen.
- */
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return static_branch_unlikely(&deferred_pages) ||
- (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return static_branch_unlikely(&deferred_pages);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -447,11 +431,9 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
+static inline bool deferred_pages_enabled(void)
{
- return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
- PageSkipKASanPoison(page);
+ return false;
}
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -734,8 +716,7 @@ static void prep_compound_head(struct page *page, unsigned int order)
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
}
static void prep_compound_tail(struct page *head, int tail_idx)
@@ -1072,14 +1053,12 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
+ unsigned int max_order = pageblock_order;
unsigned long buddy_pfn;
unsigned long combined_pfn;
- unsigned int max_order;
struct page *buddy;
bool to_tail;
- max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
-
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -1117,25 +1096,24 @@ continue_merging:
}
if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on isolate
- * pageblock and normal pageblock. Without this, pageblock
- * isolation could cause incorrect freepage or CMA accounting.
+ * We want to prevent merge between freepages on pageblock
+ * without fallbacks and normal pageblock. Without this,
+ * pageblock isolation could cause incorrect freepage or CMA
+ * accounting or HIGHATOMIC accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
- if (unlikely(has_isolate_pageblock(zone))) {
- int buddy_mt;
+ int buddy_mt;
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- buddy_mt = get_pageblock_migratetype(buddy);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+ buddy_mt = get_pageblock_migratetype(buddy);
- if (migratetype != buddy_mt
- && (is_migrate_isolate(migratetype) ||
- is_migrate_isolate(buddy_mt)))
- goto done_merging;
- }
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
max_order = order + 1;
goto continue_merging;
}
@@ -1271,15 +1249,38 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
+/*
+ * Skip KASAN memory poisoning when either:
+ *
+ * 1. Deferred memory initialization has not yet completed,
+ * see the explanation below.
+ * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON,
+ * see the comment next to it.
+ *
+ * Poisoning pages during deferred memory init will greatly lengthen the
+ * process and cause problem in large memory systems as the deferred pages
+ * initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- int i;
+ return deferred_pages_enabled() ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
+}
- if (zero_tags) {
- for (i = 0; i < numpages; i++)
- tag_clear_highpage(page + i);
- return;
- }
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
@@ -1296,7 +1297,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
- bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
+ bool init = want_init_on_free();
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1363,23 +1364,21 @@ static __always_inline bool free_pages_prepare(struct page *page,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_free_pages and kernel_init_free_pages must be
+ * KASAN poisoning and memory initialization code must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- if (kasan_has_integrated_init()) {
- if (!skip_kasan_poison)
- kasan_free_pages(page, order);
- } else {
- bool init = want_init_on_free();
+ if (!should_skip_kasan_poison(page, fpi_flags)) {
+ kasan_poison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order, false);
- if (!skip_kasan_poison)
- kasan_poison_pages(page, order, init);
+ /* Memory is already initialized if KASAN did it internally. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1432,120 +1431,83 @@ static bool bulkfree_pcp_prepare(struct page *page)
}
#endif /* CONFIG_DEBUG_VM */
-static inline void prefetch_buddy(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
- struct page *buddy = page + (buddy_pfn - pfn);
-
- prefetch(buddy);
-}
-
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone.
* count is the number of pages to free.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+ struct per_cpu_pages *pcp,
+ int pindex)
{
- int pindex = 0;
- int batch_free = 0;
- int nr_freed = 0;
+ int min_pindex = 0;
+ int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
- int prefetch_nr = READ_ONCE(pcp->batch);
bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
+
+ /* Ensure requested pindex is drained first. */
+ pindex = pindex - 1;
+
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
+ spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
while (count > 0) {
struct list_head *list;
+ int nr_pages;
- /*
- * Remove pages from lists in a round-robin fashion. A
- * batch_free count is maintained that is incremented when an
- * empty list is encountered. This is so more pages are freed
- * off fuller lists instead of spinning excessively around empty
- * lists
- */
+ /* Remove pages from lists in a round-robin fashion. */
do {
- batch_free++;
- if (++pindex == NR_PCP_LISTS)
- pindex = 0;
+ if (++pindex > max_pindex)
+ pindex = min_pindex;
list = &pcp->lists[pindex];
- } while (list_empty(list));
+ if (!list_empty(list))
+ break;
- /* This is the only non-empty list. Free them all. */
- if (batch_free == NR_PCP_LISTS)
- batch_free = count;
+ if (pindex == max_pindex)
+ max_pindex--;
+ if (pindex == min_pindex)
+ min_pindex++;
+ } while (1);
order = pindex_to_order(pindex);
+ nr_pages = 1 << order;
BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
+ int mt;
+
page = list_last_entry(list, struct page, lru);
+ mt = get_pcppage_migratetype(page);
+
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
- nr_freed += 1 << order;
- count -= 1 << order;
+ count -= nr_pages;
+ pcp->count -= nr_pages;
if (bulkfree_pcp_prepare(page))
continue;
- /* Encode order with the migratetype */
- page->index <<= NR_PCP_ORDER_WIDTH;
- page->index |= order;
-
- list_add_tail(&page->lru, &head);
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
- /*
- * We are going to put the page back to the global
- * pool, prefetch its buddy to speed up later access
- * under zone->lock. It is believed the overhead of
- * an additional test and calculating buddy_pfn here
- * can be offset by reduced memory latency later. To
- * avoid excessive prefetching due to large count, only
- * prefetch buddy for the first pcp->batch nr of pages.
- */
- if (prefetch_nr) {
- prefetch_buddy(page);
- prefetch_nr--;
- }
- } while (count > 0 && --batch_free && !list_empty(list));
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, order, mt);
+ } while (count > 0 && !list_empty(list));
}
- pcp->count -= nr_freed;
-
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
-
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
- /* mt has been encoded with the order (see above) */
- order = mt & NR_PCP_ORDER_MASK;
- mt >>= NR_PCP_ORDER_WIDTH;
-
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
- trace_mm_page_pcpu_drain(page, order, mt);
- }
spin_unlock(&zone->lock);
}
@@ -2260,19 +2222,8 @@ void __init init_cma_reserved_pageblock(struct page *page)
} while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
-
- if (pageblock_order >= MAX_ORDER) {
- i = pageblock_nr_pages;
- p = page;
- do {
- set_page_refcounted(p);
- __free_pages(p, MAX_ORDER - 1);
- p += MAX_ORDER_NR_PAGES;
- } while (i -= MAX_ORDER_NR_PAGES);
- } else {
- set_page_refcounted(page);
- __free_pages(page, pageblock_order);
- }
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
@@ -2342,23 +2293,36 @@ static inline int check_new_page(struct page *page)
return 1;
}
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
* being allocated from pcp lists. With debug_pagealloc also enabled, they are
* also checked when pcp lists are refilled from the free lists.
*/
-static inline bool check_pcp_refill(struct page *page)
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return check_new_page(page);
+ return check_new_pages(page, order);
else
return false;
}
-static inline bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page, unsigned int order)
{
- return check_new_page(page);
+ return check_new_pages(page, order);
}
#else
/*
@@ -2366,35 +2330,56 @@ static inline bool check_new_pcp(struct page *page)
* when pcp lists are being refilled from the free lists. With debug_pagealloc
* enabled, they are also checked when being allocated from the pcp lists.
*/
-static inline bool check_pcp_refill(struct page *page)
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
{
- return check_new_page(page);
+ return check_new_pages(page, order);
}
-static inline bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return check_new_page(page);
+ return check_new_pages(page, order);
else
return false;
}
#endif /* CONFIG_DEBUG_VM */
-static bool check_new_pages(struct page *page, unsigned int order)
+static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
{
- int i;
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
+ /* Don't skip if a software KASAN mode is enabled. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return false;
- if (unlikely(check_new_page(p)))
- return true;
- }
+ /* Skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return true;
- return false;
+ /*
+ * With hardware tag-based KASAN enabled, skip if either:
+ *
+ * 1. Memory tags have already been cleared via tag_clear_highpage().
+ * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ */
+ return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+}
+
+static inline bool should_skip_init(gfp_t flags)
+{
+ /* Don't skip, if hardware tag-based KASAN is not enabled. */
+ if (!kasan_hw_tags_enabled())
+ return false;
+
+ /* For hardware tag-based KASAN, skip if requested. */
+ return (flags & __GFP_SKIP_ZERO);
}
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
+ !should_skip_init(gfp_flags);
+ bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2410,19 +2395,38 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/*
* As memory initialization might be integrated into KASAN,
- * kasan_alloc_pages and kernel_init_free_pages must be
+ * KASAN unpoisoning and memory initializion code must be
* kept together to avoid discrepancies in behavior.
*/
- if (kasan_has_integrated_init()) {
- kasan_alloc_pages(page, order, gfp_flags);
- } else {
- bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+ /*
+ * If memory tags should be zeroed (which happens only when memory
+ * should be initialized as well).
+ */
+ if (init_tags) {
+ int i;
+
+ /* Initialize both memory and tags. */
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+
+ /* Note that memory is already initialized by the loop above. */
+ init = false;
+ }
+ if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ /* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
- if (init)
- kernel_init_free_pages(page, 1 << order,
- gfp_flags & __GFP_ZEROTAGS);
+
+ /* Note that memory is already initialized by KASAN. */
+ if (kasan_has_integrated_init())
+ init = false;
}
+ /* If memory is still not initialized, do it now. */
+ if (init)
+ kernel_init_free_pages(page, 1 << order);
+ /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
+ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
+ SetPageSkipKASanPoison(page);
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
@@ -2479,17 +2483,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
*/
static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
-#endif
};
#ifdef CONFIG_CMA
@@ -2795,8 +2795,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
- && !is_migrate_cma(mt)) {
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (migratetype_is_mergeable(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@ -3037,7 +3037,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
- if (unlikely(check_pcp_refill(page)))
+ if (unlikely(check_pcp_refill(page, order)))
continue;
/*
@@ -3086,7 +3086,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -3107,7 +3107,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3330,10 +3330,15 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
+ bool free_high)
{
int min_nr_free, max_nr_free;
+ /* Free everything if batch freeing high-order pages. */
+ if (unlikely(free_high))
+ return pcp->count;
+
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
return 1;
@@ -3354,11 +3359,12 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
return batch;
}
-static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
+ bool free_high)
{
int high = READ_ONCE(pcp->high);
- if (unlikely(!high))
+ if (unlikely(!high || free_high))
return 0;
if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
@@ -3371,24 +3377,34 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn,
- int migratetype, unsigned int order)
+static void free_unref_page_commit(struct page *page, int migratetype,
+ unsigned int order)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int high;
int pindex;
+ bool free_high;
__count_vm_event(PGFREE);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
pindex = order_to_pindex(migratetype, order);
list_add(&page->lru, &pcp->lists[pindex]);
pcp->count += 1 << order;
- high = nr_pcp_high(pcp, zone);
+
+ /*
+ * As high-order pages other than THP's stored on PCP can contribute
+ * to fragmentation, limit the number stored when PCP is heavily
+ * freeing without allocation. The remainder after bulk freeing
+ * stops will be drained from vmstat refresh context.
+ */
+ free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
+
+ high = nr_pcp_high(pcp, zone, free_high);
if (pcp->count >= high) {
int batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
}
}
@@ -3421,7 +3437,7 @@ void free_unref_page(struct page *page, unsigned int order)
}
local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, pfn, migratetype, order);
+ free_unref_page_commit(page, migratetype, order);
local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3431,13 +3447,13 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
- unsigned long flags, pfn;
+ unsigned long flags;
int batch_count = 0;
int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_to_pfn(page);
+ unsigned long pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn, 0)) {
list_del(&page->lru);
continue;
@@ -3453,15 +3469,10 @@ void free_unref_page_list(struct list_head *list)
free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
continue;
}
-
- set_page_private(page, pfn);
}
local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_private(page);
- set_page_private(page, 0);
-
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3471,7 +3482,7 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn, migratetype, 0);
+ free_unref_page_commit(page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
@@ -3545,8 +3556,11 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && !is_migrate_highatomic(mt))
+ /*
+ * Only change normal pageblocks (i.e., they can merge
+ * with others)
+ */
+ if (migratetype_is_mergeable(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3641,7 +3655,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count -= 1 << order;
- } while (check_new_pcp(page));
+ } while (check_new_pcp(page, order));
return page;
}
@@ -3706,10 +3720,10 @@ struct page *rmqueue(struct zone *preferred_zone,
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
/*
* order-0 request can reach here when the pcplist is skipped
* due to non-CMA allocation context. HIGHATOMIC area is
@@ -3721,15 +3735,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
- if (!page)
+ if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);
- } while (page && check_new_pages(page, order));
- if (!page)
- goto failed;
-
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!page)
+ goto failed;
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -4595,13 +4609,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
unsigned int noreclaim_flag;
- unsigned long pflags, progress;
+ unsigned long progress;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4610,7 +4623,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
- psi_memstall_leave(&pflags);
cond_resched();
@@ -4624,11 +4636,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned long *did_some_progress)
{
struct page *page = NULL;
+ unsigned long pflags;
bool drained = false;
+ psi_memstall_enter(&pflags);
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
- return NULL;
+ goto out;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
@@ -4644,6 +4658,8 @@ retry:
drained = true;
goto retry;
}
+out:
+ psi_memstall_leave(&pflags);
return page;
}
@@ -6380,7 +6396,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
@@ -6402,7 +6418,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ /*
+ * All possible nodes have pgdat preallocated
+ * in free_area_init
+ */
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
@@ -7389,16 +7409,15 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order;
+ unsigned int order = MAX_ORDER - 1;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
return;
- if (HPAGE_SHIFT > PAGE_SHIFT)
+ /* Don't let pageblocks exceed the maximum allocation granularity. */
+ if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
order = HUGETLB_PAGE_ORDER;
- else
- order = MAX_ORDER - 1;
/*
* Assume the largest contiguous order of interest is a huge page.
@@ -7502,12 +7521,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
* NOTE: this function is only called during memory hotplug
*/
#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(int nid)
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{
+ int nid = pgdat->node_id;
enum zone_type z;
- pg_data_t *pgdat = NODE_DATA(nid);
+ int cpu;
pgdat_init_internals(pgdat);
+
+ if (pgdat->per_cpu_nodestats == &boot_nodestats)
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ pgdat->node_start_pfn = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
for (z = 0; z < MAX_NR_ZONES; z++)
zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
}
@@ -7657,9 +7697,14 @@ static void __init free_area_init_node(int nid)
pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
- pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ if (start_pfn != end_pfn) {
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ } else {
+ pr_info("Initmem setup node %d as memoryless\n", nid);
+ }
+
calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
@@ -7668,7 +7713,7 @@ static void __init free_area_init_node(int nid)
free_area_init_core(pgdat);
}
-void __init free_area_init_memoryless_node(int nid)
+static void __init free_area_init_memoryless_node(int nid)
{
free_area_init_node(nid);
}
@@ -7972,10 +8017,17 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ unsigned long start_pfn, end_pfn;
+
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (zone_movable_pfn[nid] >= end_pfn)
+ zone_movable_pfn[nid] = 0;
+ }
+
out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
@@ -8096,8 +8148,36 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
+ for_each_node(nid) {
+ pg_data_t *pgdat;
+
+ if (!node_online(nid)) {
+ pr_info("Initializing node %d as memoryless\n", nid);
+
+ /* Allocator not initialized yet */
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat) {
+ pr_err("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+ continue;
+ }
+ arch_refresh_nodedata(nid, pgdat);
+ free_area_init_memoryless_node(nid);
+
+ /*
+ * We do not want to confuse userspace by sysfs
+ * files/directories for node without any memory
+ * attached to it, so this node is not marked as
+ * N_MEMORY and not marked online so that no sysfs
+ * hierarchy will be created via register_one_node for
+ * it. The pgdat will get fully initialized by
+ * hotadd_init_pgdat() when memory is hotplugged into
+ * this node.
+ */
+ continue;
+ }
+
+ pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* Any memory on that node */
@@ -8474,7 +8554,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -8986,14 +9067,12 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
- return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages) - 1);
+ return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
}
static unsigned long pfn_max_align_up(unsigned long pfn)
{
- return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages));
+ return ALIGN(pfn, MAX_ORDER_NR_PAGES);
}
#if defined(CONFIG_DYNAMIC_DEBUG) || \
@@ -9452,6 +9531,7 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
+EXPORT_SYMBOL(is_free_buddy_page);
#ifdef CONFIG_MEMORY_FAILURE
/*
diff --git a/mm/page_idle.c b/mm/page_idle.c
index edead6a8a5f9..fc0435abf909 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -13,6 +13,8 @@
#include <linux/page_ext.h>
#include <linux/page_idle.h>
+#include "internal.h"
+
#define BITMAP_CHUNK_SIZE sizeof(u64)
#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
@@ -44,15 +46,11 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
-static bool page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct folio *folio,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, addr, 0);
bool referenced = false;
while (page_vma_mapped_walk(&pvmw)) {
@@ -74,41 +72,41 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
}
if (referenced) {
- clear_page_idle(page);
+ folio_clear_idle(folio);
/*
* We cleared the referenced bit in a mapping to this page. To
* avoid interference with page reclaim, mark it young so that
- * page_referenced() will return > 0.
+ * folio_referenced() will return > 0.
*/
- set_page_young(page);
+ folio_set_young(folio);
}
return true;
}
static void page_idle_clear_pte_refs(struct page *page)
{
+ struct folio *folio = page_folio(page);
/*
* Since rwc.arg is unused, rwc is effectively immutable, so we
* can make it static const to save some cycles and stack.
*/
static const struct rmap_walk_control rwc = {
.rmap_one = page_idle_clear_pte_refs_one,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
bool need_lock;
- if (!page_mapped(page) ||
- !page_rmapping(page))
+ if (!folio_mapped(folio) || !folio_raw_mapping(folio))
return;
- need_lock = !PageAnon(page) || PageKsm(page);
- if (need_lock && !trylock_page(page))
+ need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
+ if (need_lock && !folio_trylock(folio))
return;
- rmap_walk(page, (struct rmap_walk_control *)&rwc);
+ rmap_walk(folio, &rwc);
if (need_lock)
- unlock_page(page);
+ folio_unlock(folio);
}
static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
diff --git a/mm/page_io.c b/mm/page_io.c
index 0bf8e40f4e57..b417f000b49e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -338,10 +338,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- bio = bio_alloc(GFP_NOIO, 1);
- bio_set_dev(bio, sis->bdev);
+ bio = bio_alloc(sis->bdev, 1,
+ REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
+ GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio->bi_end_io = end_write_func;
bio_add_page(bio, page, thp_size(page), 0);
@@ -359,6 +359,7 @@ int swap_readpage(struct page *page, bool synchronous)
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
+ bool workingset = PageWorkingset(page);
unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@@ -370,7 +371,8 @@ int swap_readpage(struct page *page, bool synchronous)
* or the submitting cgroup IO-throttled, submission can be a
* significant part of overall IO time.
*/
- psi_memstall_enter(&pflags);
+ if (workingset)
+ psi_memstall_enter(&pflags);
delayacct_swapin_start();
if (frontswap_load(page) == 0) {
@@ -403,9 +405,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
ret = 0;
- bio = bio_alloc(GFP_KERNEL, 1);
- bio_set_dev(bio, sis->bdev);
- bio->bi_opf = REQ_OP_READ;
+ bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
bio_add_page(bio, page, thp_size(page), 0);
@@ -433,21 +433,25 @@ int swap_readpage(struct page *page, bool synchronous)
bio_put(bio);
out:
- psi_memstall_leave(&pflags);
+ if (workingset)
+ psi_memstall_leave(&pflags);
delayacct_swapin_end();
return ret;
}
-int swap_set_page_dirty(struct page *page)
+bool swap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_info_struct *sis = swp_swap_info(folio_swap_entry(folio));
if (data_race(sis->flags & SWP_FS_OPS)) {
- struct address_space *mapping = sis->swap_file->f_mapping;
+ const struct address_space_operations *aops;
+
+ mapping = sis->swap_file->f_mapping;
+ aops = mapping->a_ops;
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- return mapping->a_ops->set_page_dirty(page);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+ return aops->dirty_folio(mapping, folio);
} else {
- return __set_page_dirty_no_writeback(page);
+ return noop_dirty_folio(mapping, folio);
}
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 6a0ddda6b3c5..f67c4c70f17f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* onlining - just onlined memory won't immediately be considered for
* allocation.
*/
- if (!isolated_page && PageBuddy(page)) {
+ if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 99e360df9465..fb3a05fdebdb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,7 @@
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
+#include <linux/memcontrol.h>
#include <linux/sched/clock.h>
#include "internal.h"
@@ -28,7 +29,9 @@ struct page_owner {
depot_stack_handle_t free_handle;
u64 ts_nsec;
u64 free_ts_nsec;
+ char comm[TASK_COMM_LEN];
pid_t pid;
+ pid_t tgid;
};
static bool page_owner_enabled = false;
@@ -163,7 +166,10 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
page_owner->pid = current->pid;
+ page_owner->tgid = current->tgid;
page_owner->ts_nsec = local_clock();
+ strlcpy(page_owner->comm, current->comm,
+ sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -229,8 +235,10 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
old_page_owner->last_migrate_reason;
new_page_owner->handle = old_page_owner->handle;
new_page_owner->pid = old_page_owner->pid;
+ new_page_owner->tgid = old_page_owner->tgid;
new_page_owner->ts_nsec = old_page_owner->ts_nsec;
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
+ strcpy(new_page_owner->comm, old_page_owner->comm);
/*
* We don't clear the bit on the old folio as it's going to be freed
@@ -325,6 +333,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
seq_putc(m, '\n');
}
+/*
+ * Looking for memcg information and print it out
+ */
+static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
+ struct page *page)
+{
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+ struct mem_cgroup *memcg;
+ bool online;
+ char name[80];
+
+ rcu_read_lock();
+ memcg_data = READ_ONCE(page->memcg_data);
+ if (!memcg_data)
+ goto out_unlock;
+
+ if (memcg_data & MEMCG_DATA_OBJCGS)
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Slab cache page\n");
+
+ memcg = page_memcg_check(page);
+ if (!memcg)
+ goto out_unlock;
+
+ online = (memcg->css.flags & CSS_ONLINE);
+ cgroup_name(memcg->css.cgroup, name, sizeof(name));
+ ret += scnprintf(kbuf + ret, count - ret,
+ "Charged %sto %smemcg %s\n",
+ PageMemcgKmem(page) ? "(via objcg) " : "",
+ online ? "" : "offline ",
+ name);
+out_unlock:
+ rcu_read_unlock();
+#endif /* CONFIG_MEMCG */
+
+ return ret;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
@@ -338,19 +385,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (!kbuf)
return -ENOMEM;
- ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
+ ret = scnprintf(kbuf, count,
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
&page_owner->gfp_mask, page_owner->pid,
+ page_owner->tgid, page_owner->comm,
page_owner->ts_nsec, page_owner->free_ts_nsec);
- if (ret >= count)
- goto err;
-
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
@@ -358,21 +403,18 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
migratetype_names[pageblock_mt],
&page->flags);
- if (ret >= count)
- goto err;
-
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
if (page_owner->last_migrate_reason != -1) {
- ret += snprintf(kbuf + ret, count - ret,
+ ret += scnprintf(kbuf + ret, count - ret,
"Page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
- if (ret >= count)
- goto err;
}
+ ret = print_page_owner_memcg(kbuf, count, ret, page);
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -415,9 +457,10 @@ void __dump_page_owner(const struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
- page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
+ page_owner->pid, page_owner->tgid, page_owner->comm,
+ page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle)
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 7504e7caa2a1..2458281bff89 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -23,15 +23,7 @@ EXPORT_SYMBOL(page_table_check_disabled);
static int __init early_page_table_check_param(char *buf)
{
- if (!buf)
- return -EINVAL;
-
- if (strcmp(buf, "on") == 0)
- __page_table_check_enabled = true;
- else if (strcmp(buf, "off") == 0)
- __page_table_check_enabled = false;
-
- return 0;
+ return strtobool(buf, &__page_table_check_enabled);
}
early_param("page_table_check", early_page_table_check_param);
@@ -86,8 +78,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
{
struct page_ext *page_ext;
struct page *page;
+ unsigned long i;
bool anon;
- int i;
if (!pfn_valid(pfn))
return;
@@ -121,8 +113,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
{
struct page_ext *page_ext;
struct page *page;
+ unsigned long i;
bool anon;
- int i;
if (!pfn_valid(pfn))
return;
@@ -152,10 +144,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
void __page_table_check_zero(struct page *page, unsigned int order)
{
struct page_ext *page_ext = lookup_page_ext(page);
- int i;
+ unsigned long i;
BUG_ON(!page_ext);
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
BUG_ON(atomic_read(&ptc->anon_map_count));
@@ -206,17 +198,10 @@ EXPORT_SYMBOL(__page_table_check_pud_clear);
void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- pte_t old_pte;
-
if (&init_mm == mm)
return;
- old_pte = *ptep;
- if (pte_user_accessible_page(old_pte)) {
- page_table_check_clear(mm, addr, pte_pfn(old_pte),
- PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pte_clear(mm, addr, *ptep);
if (pte_user_accessible_page(pte)) {
page_table_check_set(mm, addr, pte_pfn(pte),
PAGE_SIZE >> PAGE_SHIFT,
@@ -228,17 +213,10 @@ EXPORT_SYMBOL(__page_table_check_pte_set);
void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- pmd_t old_pmd;
-
if (&init_mm == mm)
return;
- old_pmd = *pmdp;
- if (pmd_user_accessible_page(old_pmd)) {
- page_table_check_clear(mm, addr, pmd_pfn(old_pmd),
- PMD_PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pmd_clear(mm, addr, *pmdp);
if (pmd_user_accessible_page(pmd)) {
page_table_check_set(mm, addr, pmd_pfn(pmd),
PMD_PAGE_SIZE >> PAGE_SHIFT,
@@ -250,17 +228,10 @@ EXPORT_SYMBOL(__page_table_check_pmd_set);
void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- pud_t old_pud;
-
if (&init_mm == mm)
return;
- old_pud = *pudp;
- if (pud_user_accessible_page(old_pud)) {
- page_table_check_clear(mm, addr, pud_pfn(old_pud),
- PUD_PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pud_clear(mm, addr, *pudp);
if (pud_user_accessible_page(pud)) {
page_table_check_set(mm, addr, pud_pfn(pud),
PUD_PAGE_SIZE >> PAGE_SHIFT,
@@ -268,3 +239,23 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
}
}
EXPORT_SYMBOL(__page_table_check_pud_set);
+
+void __page_table_check_pte_clear_range(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+ pte_t *ptep = pte_offset_map(&pmd, addr);
+ unsigned long i;
+
+ pte_unmap(ptep);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ __page_table_check_pte_clear(mm, addr, *ptep);
+ addr += PAGE_SIZE;
+ ptep++;
+ }
+ }
+}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index f7b331081791..1187f9c1ec5b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -53,18 +53,6 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}
-static inline bool pfn_is_match(struct page *page, unsigned long pfn)
-{
- unsigned long page_pfn = page_to_pfn(page);
-
- /* normal page and hugetlbfs page */
- if (!PageTransCompound(page) || PageHuge(page))
- return page_pfn == pfn;
-
- /* THP can be referenced by any subpage */
- return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page);
-}
-
/**
* check_pte - check if @pvmw->page is mapped at the @pvmw->pte
* @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking
@@ -116,7 +104,17 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(*pvmw->pte);
}
- return pfn_is_match(pvmw->page, pfn);
+ return (pfn - pvmw->pfn) < pvmw->nr_pages;
+}
+
+/* Returns true if the two ranges overlap. Careful to not overflow. */
+static bool check_pmd(unsigned long pfn, struct page_vma_mapped_walk *pvmw)
+{
+ if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn)
+ return false;
+ if (pfn > pvmw->pfn + pvmw->nr_pages - 1)
+ return false;
+ return true;
}
static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
@@ -127,7 +125,7 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
}
/**
- * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
+ * page_vma_mapped_walk - check if @pvmw->pfn is mapped in @pvmw->vma at
* @pvmw->address
* @pvmw: pointer to struct page_vma_mapped_walk. page, vma, address and flags
* must be set. pmd, pte and ptl must be NULL.
@@ -152,8 +150,8 @@ static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
*/
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
- struct mm_struct *mm = pvmw->vma->vm_mm;
- struct page *page = pvmw->page;
+ struct vm_area_struct *vma = pvmw->vma;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long end;
pgd_t *pgd;
p4d_t *p4d;
@@ -164,32 +162,26 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (unlikely(PageHuge(page))) {
+ if (unlikely(is_vm_hugetlb_page(vma))) {
+ unsigned long size = pvmw->nr_pages * PAGE_SIZE;
/* The only possible mapping was handled on last iteration */
if (pvmw->pte)
return not_found(pvmw);
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, size);
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+ pvmw->ptl = huge_pte_lockptr(size_to_hstate(size), mm,
+ pvmw->pte);
spin_lock(pvmw->ptl);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
}
- /*
- * Seek to next pte only makes sense for THP.
- * But more important than that optimization, is to filter out
- * any PageKsm page: whose page->index misleads vma_address()
- * and vma_address_end() to disaster.
- */
- end = PageTransCompound(page) ?
- vma_address_end(page, pvmw->vma) :
- pvmw->address + PAGE_SIZE;
+ end = vma_address_end(pvmw);
if (pvmw->pte)
goto next_pte;
restart:
@@ -224,7 +216,7 @@ restart:
if (likely(pmd_trans_huge(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
- if (pmd_page(pmde) != page)
+ if (!check_pmd(pmd_pfn(pmde), pvmw))
return not_found(pvmw);
return true;
}
@@ -236,7 +228,7 @@ restart:
return not_found(pvmw);
entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
- pfn_swap_entry_to_page(entry) != page)
+ !check_pmd(swp_offset(entry), pvmw))
return not_found(pvmw);
return true;
}
@@ -250,7 +242,8 @@ restart:
* cleared *pmd but not decremented compound_mapcount().
*/
if ((pvmw->flags & PVMW_SYNC) &&
- PageTransCompound(page)) {
+ transparent_hugepage_active(vma) &&
+ (pvmw->nr_pages >= HPAGE_PMD_NR)) {
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
spin_unlock(ptl);
@@ -307,7 +300,8 @@ next_pte:
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
{
struct page_vma_mapped_walk pvmw = {
- .page = page,
+ .pfn = page_to_pfn(page),
+ .nr_pages = 1,
.vma = vma,
.flags = PVMW_SYNC,
};
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index c6bd092ff7a3..dd3590dfc23d 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -144,7 +144,7 @@ alloc_buffer:
spin_unlock_irq(&pcpu_lock);
/* there can be at most this many free and allocated fragments */
- buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
+ buffer = vmalloc_array(2 * max_nr_alloc + 1, sizeof(int));
if (!buffer)
return -ENOMEM;
diff --git a/mm/ptdump.c b/mm/ptdump.c
index da751448d0e4..eea3d28d173c 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val))
+ if (pgd_leaf(val)) {
st->note_page(st, addr, 0, pgd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val))
+ if (p4d_leaf(val)) {
st->note_page(st, addr, 1, p4d_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val))
+ if (pud_leaf(val)) {
st->note_page(st, addr, 2, pud_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));
- if (pmd_leaf(val))
+ if (pmd_leaf(val)) {
st->note_page(st, addr, 3, pmd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index cf0dcf89eb69..d3a47546d17d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,6 +8,111 @@
* Initial version.
*/
+/**
+ * DOC: Readahead Overview
+ *
+ * Readahead is used to read content into the page cache before it is
+ * explicitly requested by the application. Readahead only ever
+ * attempts to read pages that are not yet in the page cache. If a
+ * page is present but not up-to-date, readahead will not try to read
+ * it. In that case a simple ->readpage() will be requested.
+ *
+ * Readahead is triggered when an application read request (whether a
+ * systemcall or a page fault) finds that the requested page is not in
+ * the page cache, or that it is in the page cache and has the
+ * %PG_readahead flag set. This flag indicates that the page was loaded
+ * as part of a previous read-ahead request and now that it has been
+ * accessed, it is time for the next read-ahead.
+ *
+ * Each readahead request is partly synchronous read, and partly async
+ * read-ahead. This is reflected in the struct file_ra_state which
+ * contains ->size being to total number of pages, and ->async_size
+ * which is the number of pages in the async section. The first page in
+ * this async section will have %PG_readahead set as a trigger for a
+ * subsequent read ahead. Once a series of sequential reads has been
+ * established, there should be no need for a synchronous component and
+ * all read ahead request will be fully asynchronous.
+ *
+ * When either of the triggers causes a readahead, three numbers need to
+ * be determined: the start of the region, the size of the region, and
+ * the size of the async tail.
+ *
+ * The start of the region is simply the first page address at or after
+ * the accessed address, which is not currently populated in the page
+ * cache. This is found with a simple search in the page cache.
+ *
+ * The size of the async tail is determined by subtracting the size that
+ * was explicitly requested from the determined request size, unless
+ * this would be less than zero - then zero is used. NOTE THIS
+ * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
+ * PAGE.
+ *
+ * The size of the region is normally determined from the size of the
+ * previous readahead which loaded the preceding pages. This may be
+ * discovered from the struct file_ra_state for simple sequential reads,
+ * or from examining the state of the page cache when multiple
+ * sequential reads are interleaved. Specifically: where the readahead
+ * was triggered by the %PG_readahead flag, the size of the previous
+ * readahead is assumed to be the number of pages from the triggering
+ * page to the start of the new readahead. In these cases, the size of
+ * the previous readahead is scaled, often doubled, for the new
+ * readahead, though see get_next_ra_size() for details.
+ *
+ * If the size of the previous read cannot be determined, the number of
+ * preceding pages in the page cache is used to estimate the size of
+ * a previous read. This estimate could easily be misled by random
+ * reads being coincidentally adjacent, so it is ignored unless it is
+ * larger than the current request, and it is not scaled up, unless it
+ * is at the start of file.
+ *
+ * In general read ahead is accelerated at the start of the file, as
+ * reads from there are often sequential. There are other minor
+ * adjustments to the read ahead size in various special cases and these
+ * are best discovered by reading the code.
+ *
+ * The above calculation determines the readahead, to which any requested
+ * read size may be added.
+ *
+ * Readahead requests are sent to the filesystem using the ->readahead()
+ * address space operation, for which mpage_readahead() is a canonical
+ * implementation. ->readahead() should normally initiate reads on all
+ * pages, but may fail to read any or all pages without causing an IO
+ * error. The page cache reading code will issue a ->readpage() request
+ * for any page which ->readahead() does not provided, and only an error
+ * from this will be final.
+ *
+ * ->readahead() will generally call readahead_page() repeatedly to get
+ * each page from those prepared for read ahead. It may fail to read a
+ * page by:
+ *
+ * * not calling readahead_page() sufficiently many times, effectively
+ * ignoring some pages, as might be appropriate if the path to
+ * storage is congested.
+ *
+ * * failing to actually submit a read request for a given page,
+ * possibly due to insufficient resources, or
+ *
+ * * getting an error during subsequent processing of a request.
+ *
+ * In the last two cases, the page should be unlocked to indicate that
+ * the read attempt has failed. In the first case the page will be
+ * unlocked by the caller.
+ *
+ * Those pages not in the final ``async_size`` of the request should be
+ * considered to be important and ->readahead() should not fail them due
+ * to congestion or temporary resource unavailability, but should wait
+ * for necessary resources (e.g. memory or indexing information) to
+ * become available. Pages in the final ``async_size`` may be
+ * considered less urgent and failure to read them is more acceptable.
+ * In this case it is best to use delete_from_page_cache() to remove the
+ * pages from the page cache as is automatically done for pages that
+ * were not fetched with readahead_page(). This will allow a
+ * subsequent synchronous read ahead request to try them again. If they
+ * are left in the page cache, then they will be read individually using
+ * ->readpage().
+ *
+ */
+
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -51,7 +156,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
if (!trylock_page(page))
BUG();
page->mapping = mapping;
- do_invalidatepage(page, 0, PAGE_SIZE);
+ folio_invalidate(page_folio(page), 0, PAGE_SIZE);
page->mapping = NULL;
unlock_page(page);
}
@@ -127,8 +232,17 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
if (aops->readahead) {
aops->readahead(rac);
- /* Clean up the remaining pages */
+ /*
+ * Clean up the remaining pages. The sizes in ->ra
+ * maybe be used to size next read-ahead, so make sure
+ * they accurately reflect what happened.
+ */
while ((page = readahead_page(rac))) {
+ rac->ra->size -= 1;
+ if (rac->ra->async_size > 0) {
+ rac->ra->async_size -= 1;
+ delete_from_page_cache(page);
+ }
unlock_page(page);
put_page(page);
}
@@ -148,7 +262,7 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages,
blk_finish_plug(&plug);
- BUG_ON(!list_empty(pages));
+ BUG_ON(pages && !list_empty(pages));
BUG_ON(readahead_count(rac));
out:
@@ -247,7 +361,7 @@ EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void do_page_cache_ra(struct readahead_control *ractl,
+static void do_page_cache_ra(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct inode *inode = ractl->mapping->host;
@@ -432,10 +546,102 @@ static int try_context_readahead(struct address_space *mapping,
}
/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
+static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
+ pgoff_t mark, unsigned int order, gfp_t gfp)
+{
+ int err;
+ struct folio *folio = filemap_alloc_folio(gfp, order);
+
+ if (!folio)
+ return -ENOMEM;
+ if (mark - index < (1UL << order))
+ folio_set_readahead(folio);
+ err = filemap_add_folio(ractl->mapping, folio, index, gfp);
+ if (err)
+ folio_put(folio);
+ else
+ ractl->_nr_pages += 1UL << order;
+ return err;
+}
+
+void page_cache_ra_order(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned int new_order)
+{
+ struct address_space *mapping = ractl->mapping;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
+ pgoff_t mark = index + ra->size - ra->async_size;
+ int err = 0;
+ gfp_t gfp = readahead_gfp_mask(mapping);
+
+ if (!mapping_large_folio_support(mapping) || ra->size < 4)
+ goto fallback;
+
+ limit = min(limit, index + ra->size - 1);
+
+ if (new_order < MAX_PAGECACHE_ORDER) {
+ new_order += 2;
+ if (new_order > MAX_PAGECACHE_ORDER)
+ new_order = MAX_PAGECACHE_ORDER;
+ while ((1 << new_order) > ra->size)
+ new_order--;
+ }
+
+ while (index <= limit) {
+ unsigned int order = new_order;
+
+ /* Align with smaller pages if needed */
+ if (index & ((1UL << order) - 1)) {
+ order = __ffs(index);
+ if (order == 1)
+ order = 0;
+ }
+ /* Don't allocate pages past EOF */
+ while (index + (1UL << order) - 1 > limit) {
+ if (--order == 1)
+ order = 0;
+ }
+ err = ra_alloc_folio(ractl, index, mark, order, gfp);
+ if (err)
+ break;
+ index += 1UL << order;
+ }
+
+ if (index > limit) {
+ ra->size += index - limit - 1;
+ ra->async_size += index - limit - 1;
+ }
+
+ read_pages(ractl, NULL, false);
+
+ /*
+ * If there were already pages in the page cache, then we may have
+ * left some gaps. Let the regular readahead code take care of this
+ * situation.
+ */
+ if (!err)
+ return;
+fallback:
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
+}
+
+/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
static void ondemand_readahead(struct readahead_control *ractl,
- bool hit_readahead_marker, unsigned long req_size)
+ struct folio *folio, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
struct file_ra_state *ra = ractl->ra;
@@ -470,12 +676,12 @@ static void ondemand_readahead(struct readahead_control *ractl,
}
/*
- * Hit a marked page without valid readahead state.
+ * Hit a marked folio without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
- if (hit_readahead_marker) {
+ if (folio) {
pgoff_t start;
rcu_read_lock();
@@ -548,7 +754,7 @@ readit:
}
ractl->_index = ra->start;
- do_page_cache_ra(ractl, ra->size, ra->async_size);
+ page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
}
void page_cache_sync_ra(struct readahead_control *ractl,
@@ -576,7 +782,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
}
/* do read-ahead */
- ondemand_readahead(ractl, false, req_count);
+ ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
@@ -595,17 +801,11 @@ void page_cache_async_ra(struct readahead_control *ractl,
folio_clear_readahead(folio);
- /*
- * Defer asynchronous read-ahead on IO congestion.
- */
- if (inode_read_congested(ractl->mapping->host))
- return;
-
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(ractl, true, req_count);
+ ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
diff --git a/mm/rmap.c b/mm/rmap.c
index 6a1e8c7f6213..5cb970d51f0a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,8 +31,8 @@
* mm->page_table_lock or pte_lock
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * lock_page_memcg move_lock (in __set_page_dirty_buffers)
+ * mapping->private_lock (in block_dirty_folio)
+ * folio_lock_memcg move_lock (in block_dirty_folio)
* i_pages lock (widely used)
* lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
@@ -76,7 +76,9 @@
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
+#include <trace/events/migrate.h>
#include "internal.h"
@@ -107,15 +109,15 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma_read() such that
+ * Synchronize against folio_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * down_read_trylock() from page_lock_anon_vma_read(). This orders:
+ * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma_read() VS put_anon_vma()
+ * folio_lock_anon_vma_read() VS put_anon_vma()
* down_read_trylock() atomic_dec_and_test()
* LOCK MB
* atomic_read() rwsem_is_locked()
@@ -168,7 +170,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma_read()
+ * optimistically looked up an anon_vma in folio_lock_anon_vma_read()
* and that may actually touch the rwsem even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -526,28 +528,28 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
-struct anon_vma *page_lock_anon_vma_read(struct page *page)
+struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
- * If the page is still mapped, then this anon_vma is still
+ * If the folio is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
@@ -560,7 +562,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
goto out;
}
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
@@ -737,8 +739,9 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- if (PageAnon(page)) {
- struct anon_vma *page__anon_vma = page_anon_vma(page);
+ struct folio *folio = page_folio(page);
+ if (folio_test_anon(folio)) {
+ struct anon_vma *page__anon_vma = folio_anon_vma(folio);
/*
* Note: swapoff's unuse_vma() is more efficient with this
* check, and needs it to match anon_vma when KSM is active.
@@ -748,7 +751,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
} else if (!vma->vm_file) {
return -EFAULT;
- } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
+ } else if (vma->vm_file->f_mapping != folio->mapping) {
return -EFAULT;
}
@@ -789,30 +792,29 @@ out:
return pmd;
}
-struct page_referenced_arg {
+struct folio_referenced_arg {
int mapcount;
int referenced;
unsigned long vm_flags;
struct mem_cgroup *memcg;
};
/*
- * arg: page_referenced_arg will be passed
+ * arg: folio_referenced_arg will be passed
*/
-static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+static bool folio_referenced_one(struct folio *folio,
+ struct vm_area_struct *vma, unsigned long address, void *arg)
{
- struct page_referenced_arg *pra = arg;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ struct folio_referenced_arg *pra = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
int referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
- if (vma->vm_flags & VM_LOCKED) {
+ if ((vma->vm_flags & VM_LOCKED) &&
+ (!folio_test_large(folio) || !pvmw.pte)) {
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma, !pvmw.pte);
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
@@ -824,10 +826,10 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
/*
* Don't treat a reference through
* a sequentially read mapping as such.
- * If the page has been used in another mapping,
+ * If the folio has been used in another mapping,
* we will catch it; if this other mapping is
* already gone, the unmap path will have set
- * PG_referenced or activated the page.
+ * the referenced flag or activated the folio.
*/
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
@@ -837,7 +839,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
pvmw.pmd))
referenced++;
} else {
- /* unexpected pmd-mapped page? */
+ /* unexpected pmd-mapped folio? */
WARN_ON_ONCE(1);
}
@@ -845,13 +847,13 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (referenced)
- clear_page_idle(page);
- if (test_and_clear_page_young(page))
+ folio_clear_idle(folio);
+ if (folio_test_clear_young(folio))
referenced++;
if (referenced) {
pra->referenced++;
- pra->vm_flags |= vma->vm_flags;
+ pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
}
if (!pra->mapcount)
@@ -860,9 +862,9 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
return true;
}
-static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
+static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
- struct page_referenced_arg *pra = arg;
+ struct folio_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;
if (!mm_match_cgroup(vma->vm_mm, memcg))
@@ -872,40 +874,39 @@ static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
}
/**
- * page_referenced - test if the page was referenced
- * @page: the page to test
- * @is_locked: caller holds lock on the page
+ * folio_referenced() - Test if the folio was referenced.
+ * @folio: The folio to test.
+ * @is_locked: Caller holds lock on the folio.
* @memcg: target memory cgroup
- * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
+ * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
+ *
+ * Quick test_and_clear_referenced for all mappings of a folio,
*
- * Quick test_and_clear_referenced for all mappings to a page,
- * returns the number of ptes which referenced the page.
+ * Return: The number of mappings which referenced the folio.
*/
-int page_referenced(struct page *page,
- int is_locked,
- struct mem_cgroup *memcg,
- unsigned long *vm_flags)
+int folio_referenced(struct folio *folio, int is_locked,
+ struct mem_cgroup *memcg, unsigned long *vm_flags)
{
int we_locked = 0;
- struct page_referenced_arg pra = {
- .mapcount = total_mapcount(page),
+ struct folio_referenced_arg pra = {
+ .mapcount = folio_mapcount(folio),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
- .rmap_one = page_referenced_one,
+ .rmap_one = folio_referenced_one,
.arg = (void *)&pra,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
*vm_flags = 0;
if (!pra.mapcount)
return 0;
- if (!page_rmapping(page))
+ if (!folio_raw_mapping(folio))
return 0;
- if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
- we_locked = trylock_page(page);
+ if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
+ we_locked = folio_trylock(folio);
if (!we_locked)
return 1;
}
@@ -916,37 +917,32 @@ int page_referenced(struct page *page,
* cgroups
*/
if (memcg) {
- rwc.invalid_vma = invalid_page_referenced_vma;
+ rwc.invalid_vma = invalid_folio_referenced_vma;
}
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
- unlock_page(page);
+ folio_unlock(folio);
return pra.referenced;
}
-static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- .flags = PVMW_SYNC,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
struct mmu_notifier_range range;
int *cleaned = arg;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
- * the page can not be free from this function.
+ * the folio can not be freed from this function.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- vma_address_end(page, vma));
+ vma_address_end(&pvmw));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -974,14 +970,14 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
- flush_cache_page(vma, address, page_to_pfn(page));
+ flush_cache_page(vma, address, folio_pfn(folio));
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
#else
- /* unexpected pmd-mapped page? */
+ /* unexpected pmd-mapped folio? */
WARN_ON_ONCE(1);
#endif
}
@@ -1029,7 +1025,7 @@ int folio_mkclean(struct folio *folio)
if (!mapping)
return 0;
- rmap_walk(&folio->page, &rwc);
+ rmap_walk(folio, &rwc);
return cleaned;
}
@@ -1057,8 +1053,8 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
/*
* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
- * simultaneously, so a concurrent reader (eg page_referenced()'s
- * PageAnon()) will not see one without the other.
+ * simultaneously, so a concurrent reader (eg folio_referenced()'s
+ * folio_test_anon()) will not see one without the other.
*/
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}
@@ -1108,6 +1104,7 @@ static void __page_set_anon_rmap(struct page *page,
static void __page_check_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
+ struct folio *folio = page_folio(page);
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
@@ -1119,7 +1116,8 @@ static void __page_check_anon_rmap(struct page *page,
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
+ folio);
VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
page);
}
@@ -1181,17 +1179,17 @@ void do_page_add_anon_rmap(struct page *page,
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page))) {
+ if (unlikely(PageKsm(page)))
unlock_page_memcg(page);
- return;
- }
/* address might be in next vma when migration races vma_adjust */
- if (first)
+ else if (first)
__page_set_anon_rmap(page, vma, address,
flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
+
+ mlock_vma_page(page, vma, compound);
}
/**
@@ -1216,8 +1214,7 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
@@ -1232,26 +1229,39 @@ void page_add_new_anon_rmap(struct page *page,
/**
* page_add_file_rmap - add pte mapping to a file page
- * @page: the page to add the mapping to
- * @compound: charge the page as compound or small page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_add_file_rmap(struct page *page, bool compound)
+void page_add_file_rmap(struct page *page,
+ struct vm_area_struct *vma, bool compound)
{
- int i, nr = 1;
+ int i, nr = 0;
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
- for (i = 0, nr = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
+
+ /*
+ * It is racy to ClearPageDoubleMap in page_remove_file_rmap();
+ * but page lock is held by all page_add_file_rmap() compound
+ * callers, and SetPageDoubleMap below warns if !PageLocked:
+ * so here is a place that DoubleMap can be safely cleared.
+ */
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ if (nr == nr_pages && PageDoubleMap(page))
+ ClearPageDoubleMap(page);
+
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
nr_pages);
@@ -1260,25 +1270,23 @@ void page_add_file_rmap(struct page *page, bool compound)
nr_pages);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
- struct page *head = compound_head(page);
-
VM_WARN_ON_ONCE(!PageLocked(page));
-
- SetPageDoubleMap(head);
- if (PageMlocked(page))
- clear_page_mlock(head);
+ SetPageDoubleMap(compound_head(page));
}
- if (!atomic_inc_and_test(&page->_mapcount))
- goto out;
+ if (atomic_inc_and_test(&page->_mapcount))
+ nr++;
}
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
out:
+ if (nr)
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
unlock_page_memcg(page);
+
+ mlock_vma_page(page, vma, compound);
}
static void page_remove_file_rmap(struct page *page, bool compound)
{
- int i, nr = 1;
+ int i, nr = 0;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
@@ -1293,12 +1301,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (compound && PageTransHuge(page)) {
int nr_pages = thp_nr_pages(page);
- for (i = 0, nr = 0; i < nr_pages; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- return;
+ goto out;
if (PageSwapBacked(page))
__mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
-nr_pages);
@@ -1306,19 +1314,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
__mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
-nr_pages);
} else {
- if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ if (atomic_add_negative(-1, &page->_mapcount))
+ nr++;
}
-
- /*
- * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
- * these counters are not modified in interrupt context, and
- * pte lock(a spinlock) is held, which implies preemption disabled.
- */
- __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
-
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
+out:
+ if (nr)
+ __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1358,9 +1359,6 @@ static void page_remove_anon_compound_rmap(struct page *page)
nr = thp_nr_pages(page);
}
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-
if (nr)
__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
@@ -1368,11 +1366,13 @@ static void page_remove_anon_compound_rmap(struct page *page)
/**
* page_remove_rmap - take down pte mapping from a page
* @page: page to remove mapping from
+ * @vma: the vm area from which the mapping is removed
* @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page, bool compound)
+void page_remove_rmap(struct page *page,
+ struct vm_area_struct *vma, bool compound)
{
lock_page_memcg(page);
@@ -1397,9 +1397,6 @@ void page_remove_rmap(struct page *page, bool compound)
*/
__dec_lruvec_page_state(page, NR_ANON_MAPPED);
- if (unlikely(PageMlocked(page)))
- clear_page_mlock(page);
-
if (PageTransCompound(page))
deferred_split_huge_page(compound_head(page));
@@ -1414,20 +1411,18 @@ void page_remove_rmap(struct page *page, bool compound)
*/
out:
unlock_page_memcg(page);
+
+ munlock_vma_page(page, vma, compound);
}
/*
* @arg: enum ttu_flags will be passed to this argument
*/
-static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
bool ret = true;
@@ -1444,21 +1439,20 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pvmw.flags = PVMW_SYNC;
if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, false, page);
+ split_huge_pmd_address(vma, address, false, folio);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
* For hugetlb, it could be much worse if we need to do pud
* invalidation in the case of pmd sharing.
*
- * Note that the page can not be free in this function as call of
- * try_to_unmap() must hold a reference on the page.
+ * Note that the folio can not be freed in this function as call of
+ * try_to_unmap() must hold a reference on the folio.
*/
- range.end = PageKsm(page) ?
- address + PAGE_SIZE : vma_address_end(page, vma);
+ range.end = vma_address_end(&pvmw);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, range.end);
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
@@ -1469,32 +1463,26 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
+
/*
- * If the page is mlock()d, we cannot swap it out.
+ * If the folio is in an mlock()d vma, we must not swap it out.
*/
if (!(flags & TTU_IGNORE_MLOCK) &&
(vma->vm_flags & VM_LOCKED)) {
- /*
- * PTE-mapped THP are never marked as mlocked: so do
- * not set it on a DoubleMap THP, nor on an Anon THP
- * (which may still be PTE-mapped after DoubleMap was
- * cleared). But stop unmapping even in those cases.
- */
- if (!PageTransCompound(page) || (PageHead(page) &&
- !PageDoubleMap(page) && !PageAnon(page)))
- mlock_vma_page(page);
+ /* Restore the mlock which got missed */
+ mlock_vma_folio(folio, vma, false);
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- /* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
-
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ subpage = folio_page(folio,
+ pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
- if (PageHuge(page) && !PageAnon(page)) {
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
@@ -1533,7 +1521,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the page.
+ * a remote CPU could still be writing to the folio.
* If the entry was previously clean then the
* architecture must guarantee that a clear->dirty
* transition on a cached TLB entry is written through
@@ -1546,22 +1534,22 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pteval = ptep_clear_flush(vma, address, pvmw.pte);
}
- /* Move the dirty bit to the page. Now the pte is gone. */
+ /* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+ if (PageHWPoison(subpage) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- if (PageHuge(page)) {
- hugetlb_count_sub(compound_nr(page), mm);
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_count_sub(folio_nr_pages(folio), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1576,18 +1564,19 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
- } else if (PageAnon(page)) {
+ } else if (folio_test_anon(folio)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
/*
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ if (unlikely(folio_test_swapbacked(folio) !=
+ folio_test_swapcache(folio))) {
WARN_ON_ONCE(1);
ret = false;
/* We have to invalidate as we cleared the pte */
@@ -1598,8 +1587,31 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/* MADV_FREE page check */
- if (!PageSwapBacked(page)) {
- if (!PageDirty(page)) {
+ if (!folio_test_swapbacked(folio)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = folio_ref_count(folio);
+ map_count = folio_mapcount(folio);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !folio_test_dirty(folio)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
@@ -1608,11 +1620,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
/*
- * If the page was redirtied, it cannot be
+ * If the folio was redirtied, it cannot be
* discarded. Remap the page to page table.
*/
set_pte_at(mm, address, pvmw.pte, pteval);
- SetPageSwapBacked(page);
+ folio_set_swapbacked(folio);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1649,16 +1661,17 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
address + PAGE_SIZE);
} else {
/*
- * This is a locked file-backed page, thus it cannot
- * be removed from the page cache and replaced by a new
- * page before mmu_notifier_invalidate_range_end, so no
- * concurrent thread might update its page table to
- * point at new page while a device still is using this
- * page.
+ * This is a locked file-backed folio,
+ * so it cannot be removed from the page
+ * cache and replaced by a new folio before
+ * mmu_notifier_invalidate_range_end, so no
+ * concurrent thread might update its page table
+ * to point at a new folio while a device is
+ * still using this folio.
*
* See Documentation/vm/mmu_notifier.rst
*/
- dec_mm_counter(mm, mm_counter_file(page));
+ dec_mm_counter(mm, mm_counter_file(&folio->page));
}
discard:
/*
@@ -1668,8 +1681,10 @@ discard:
*
* See Documentation/vm/mmu_notifier.rst
*/
- page_remove_rmap(subpage, PageHuge(page));
- put_page(page);
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ folio_put(folio);
}
mmu_notifier_invalidate_range_end(&range);
@@ -1682,35 +1697,35 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_not_mapped(struct page *page)
+static int page_not_mapped(struct folio *folio)
{
- return !page_mapped(page);
+ return !folio_mapped(folio);
}
/**
- * try_to_unmap - try to remove all page table mappings to a page
- * @page: the page to get unmapped
+ * try_to_unmap - Try to remove all page table mappings to a folio.
+ * @folio: The folio to unmap.
* @flags: action and flags
*
* Tries to remove all the page table entries which are mapping this
- * page, used in the pageout path. Caller must hold the page lock.
+ * folio. It is the caller's responsibility to check if the folio is
+ * still mapped if needed (use TTU_SYNC to prevent accounting races).
*
- * It is the caller's responsibility to check if the page is still
- * mapped when needed (use TTU_SYNC to prevent accounting races).
+ * Context: Caller must hold the folio lock.
*/
-void try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
.done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
if (flags & TTU_RMAP_LOCKED)
- rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(folio, &rwc);
else
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
}
/*
@@ -1719,15 +1734,11 @@ void try_to_unmap(struct page *page, enum ttu_flags flags)
* If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
* containing migration entries.
*/
-static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
bool ret = true;
@@ -1748,7 +1759,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* TTU_SPLIT_HUGE_PMD and it wants to freeze.
*/
if (flags & TTU_SPLIT_HUGE_PMD)
- split_huge_pmd_address(vma, address, true, page);
+ split_huge_pmd_address(vma, address, true, folio);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
@@ -1758,11 +1769,10 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- range.end = PageKsm(page) ?
- address + PAGE_SIZE : vma_address_end(page, vma);
+ range.end = vma_address_end(&pvmw);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address, range.end);
- if (PageHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
@@ -1776,21 +1786,24 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD-mapped THP migration entry */
if (!pvmw.pte) {
- VM_BUG_ON_PAGE(PageHuge(page) ||
- !PageTransCompound(page), page);
+ subpage = folio_page(folio,
+ pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
+ VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
+ !folio_test_pmd_mappable(folio), folio);
- set_pmd_migration_entry(&pvmw, page);
+ set_pmd_migration_entry(&pvmw, subpage);
continue;
}
#endif
/* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ subpage = folio_page(folio,
+ pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
- if (PageHuge(page) && !PageAnon(page)) {
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
@@ -1828,15 +1841,15 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
- /* Move the dirty bit to the page. Now the pte is gone. */
+ /* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (is_zone_device_page(page)) {
- unsigned long pfn = page_to_pfn(page);
+ if (folio_is_zone_device(folio)) {
+ unsigned long pfn = folio_pfn(folio);
swp_entry_t entry;
pte_t swp_pte;
@@ -1861,6 +1874,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
+ compound_order(&folio->page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1872,16 +1887,16 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* changed when hugepage migrations to device private
* memory are supported.
*/
- subpage = page;
- } else if (PageHWPoison(page)) {
+ subpage = &folio->page;
+ } else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
- if (PageHuge(page)) {
- hugetlb_count_sub(compound_nr(page), mm);
+ if (folio_test_hugetlb(folio)) {
+ hugetlb_count_sub(folio_nr_pages(folio), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
} else {
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
}
@@ -1896,7 +1911,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* migration) will not expect userfaults on already
* copied pages.
*/
- dec_mm_counter(mm, mm_counter(page));
+ dec_mm_counter(mm, mm_counter(&folio->page));
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
@@ -1929,6 +1944,8 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
+ trace_set_migration_pte(address, pte_val(swp_pte),
+ compound_order(&folio->page));
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
@@ -1942,8 +1959,10 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
*
* See Documentation/vm/mmu_notifier.rst
*/
- page_remove_rmap(subpage, PageHuge(page));
- put_page(page);
+ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_page_drain(smp_processor_id());
+ folio_put(folio);
}
mmu_notifier_invalidate_range_end(&range);
@@ -1953,19 +1972,19 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
/**
* try_to_migrate - try to replace all page table mappings with swap entries
- * @page: the page to replace page table entries for
+ * @folio: the folio to replace page table entries for
* @flags: action and flags
*
- * Tries to remove all the page table entries which are mapping this page and
- * replace them with special swap entries. Caller must hold the page lock.
+ * Tries to remove all the page table entries which are mapping this folio and
+ * replace them with special swap entries. Caller must hold the folio lock.
*/
-void try_to_migrate(struct page *page, enum ttu_flags flags)
+void try_to_migrate(struct folio *folio, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_migrate_one,
.arg = (void *)flags,
.done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
};
/*
@@ -1976,7 +1995,7 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
TTU_SYNC)))
return;
- if (is_zone_device_page(page) && !is_device_private_page(page))
+ if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
return;
/*
@@ -1987,83 +2006,13 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if (!PageKsm(page) && PageAnon(page))
+ if (!folio_test_ksm(folio) && folio_test_anon(folio))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
- rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(folio, &rwc);
else
- rmap_walk(page, &rwc);
-}
-
-/*
- * Walks the vma's mapping a page and mlocks the page if any locked vma's are
- * found. Once one is found the page is locked and the scan can be terminated.
- */
-static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
- unsigned long address, void *unused)
-{
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
-
- /* An un-locked vma doesn't have any pages to lock, continue the scan */
- if (!(vma->vm_flags & VM_LOCKED))
- return true;
-
- while (page_vma_mapped_walk(&pvmw)) {
- /*
- * Need to recheck under the ptl to serialise with
- * __munlock_pagevec_fill() after VM_LOCKED is cleared in
- * munlock_vma_pages_range().
- */
- if (vma->vm_flags & VM_LOCKED) {
- /*
- * PTE-mapped THP are never marked as mlocked; but
- * this function is never called on a DoubleMap THP,
- * nor on an Anon THP (which may still be PTE-mapped
- * after DoubleMap was cleared).
- */
- mlock_vma_page(page);
- /*
- * No need to scan further once the page is marked
- * as mlocked.
- */
- page_vma_mapped_walk_done(&pvmw);
- return false;
- }
- }
-
- return true;
-}
-
-/**
- * page_mlock - try to mlock a page
- * @page: the page to be mlocked
- *
- * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
- * the page if any are found. The page will be returned with PG_mlocked cleared
- * if it is not mapped by any locked vmas.
- */
-void page_mlock(struct page *page)
-{
- struct rmap_walk_control rwc = {
- .rmap_one = page_mlock_one,
- .done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
-
- };
-
- VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
- VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
-
- /* Anon THP are only marked as mlocked when singly mapped */
- if (PageTransCompound(page) && PageAnon(page))
- return;
-
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
}
#ifdef CONFIG_DEVICE_PRIVATE
@@ -2074,15 +2023,11 @@ struct make_exclusive_args {
bool valid;
};
-static bool page_make_device_exclusive_one(struct page *page,
+static bool page_make_device_exclusive_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *priv)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = page,
- .vma = vma,
- .address = address,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
struct make_exclusive_args *args = priv;
pte_t pteval;
struct page *subpage;
@@ -2093,12 +2038,13 @@ static bool page_make_device_exclusive_one(struct page *page,
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
vma->vm_mm, address, min(vma->vm_end,
- address + page_size(page)), args->owner);
+ address + folio_size(folio)),
+ args->owner);
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
/* Unexpected PMD-mapped THP? */
- VM_BUG_ON_PAGE(!pvmw.pte, page);
+ VM_BUG_ON_FOLIO(!pvmw.pte, folio);
if (!pte_present(*pvmw.pte)) {
ret = false;
@@ -2106,16 +2052,17 @@ static bool page_make_device_exclusive_one(struct page *page,
break;
}
- subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ subpage = folio_page(folio,
+ pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
- /* Move the dirty bit to the page. Now the pte is gone. */
+ /* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
- set_page_dirty(page);
+ folio_mark_dirty(folio);
/*
* Check that our target page is still mapped at the expected
@@ -2148,7 +2095,7 @@ static bool page_make_device_exclusive_one(struct page *page,
* There is a reference on the page for the swap entry which has
* been removed, so shouldn't take another.
*/
- page_remove_rmap(subpage, false);
+ page_remove_rmap(subpage, vma, false);
}
mmu_notifier_invalidate_range_end(&range);
@@ -2157,21 +2104,22 @@ static bool page_make_device_exclusive_one(struct page *page,
}
/**
- * page_make_device_exclusive - mark the page exclusively owned by a device
- * @page: the page to replace page table entries for
- * @mm: the mm_struct where the page is expected to be mapped
- * @address: address where the page is expected to be mapped
+ * folio_make_device_exclusive - Mark the folio exclusively owned by a device.
+ * @folio: The folio to replace page table entries for.
+ * @mm: The mm_struct where the folio is expected to be mapped.
+ * @address: Address where the folio is expected to be mapped.
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
*
- * Tries to remove all the page table entries which are mapping this page and
- * replace them with special device exclusive swap entries to grant a device
- * exclusive access to the page. Caller must hold the page lock.
+ * Tries to remove all the page table entries which are mapping this
+ * folio and replace them with special device exclusive swap entries to
+ * grant a device exclusive access to the folio.
*
- * Returns false if the page is still mapped, or if it could not be unmapped
+ * Context: Caller must hold the folio lock.
+ * Return: false if the page is still mapped, or if it could not be unmapped
* from the expected address. Otherwise returns true (success).
*/
-static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
- unsigned long address, void *owner)
+static bool folio_make_device_exclusive(struct folio *folio,
+ struct mm_struct *mm, unsigned long address, void *owner)
{
struct make_exclusive_args args = {
.mm = mm,
@@ -2182,21 +2130,20 @@ static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
.done = page_not_mapped,
- .anon_lock = page_lock_anon_vma_read,
+ .anon_lock = folio_lock_anon_vma_read,
.arg = &args,
};
/*
- * Restrict to anonymous pages for now to avoid potential writeback
- * issues. Also tail pages shouldn't be passed to rmap_walk so skip
- * those.
+ * Restrict to anonymous folios for now to avoid potential writeback
+ * issues.
*/
- if (!PageAnon(page) || PageTail(page))
+ if (!folio_test_anon(folio))
return false;
- rmap_walk(page, &rwc);
+ rmap_walk(folio, &rwc);
- return args.valid && !page_mapcount(page);
+ return args.valid && !folio_mapcount(folio);
}
/**
@@ -2234,15 +2181,16 @@ int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
return npages;
for (i = 0; i < npages; i++, start += PAGE_SIZE) {
- if (!trylock_page(pages[i])) {
- put_page(pages[i]);
+ struct folio *folio = page_folio(pages[i]);
+ if (PageTail(pages[i]) || !folio_trylock(folio)) {
+ folio_put(folio);
pages[i] = NULL;
continue;
}
- if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
- unlock_page(pages[i]);
- put_page(pages[i]);
+ if (!folio_make_device_exclusive(folio, mm, start, owner)) {
+ folio_unlock(folio);
+ folio_put(folio);
pages[i] = NULL;
}
}
@@ -2261,21 +2209,21 @@ void __put_anon_vma(struct anon_vma *anon_vma)
anon_vma_free(root);
}
-static struct anon_vma *rmap_walk_anon_lock(struct page *page,
- struct rmap_walk_control *rwc)
+static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
+ const struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
- return rwc->anon_lock(page);
+ return rwc->anon_lock(folio);
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
+ * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
- anon_vma = page_anon_vma(page);
+ anon_vma = folio_anon_vma(folio);
if (!anon_vma)
return NULL;
@@ -2291,35 +2239,30 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
- *
- * When called from page_mlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
-static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
- bool locked)
+static void rmap_walk_anon(struct folio *folio,
+ const struct rmap_walk_control *rwc, bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
if (locked) {
- anon_vma = page_anon_vma(page);
+ anon_vma = folio_anon_vma(folio);
/* anon_vma disappear under us? */
- VM_BUG_ON_PAGE(!anon_vma, page);
+ VM_BUG_ON_FOLIO(!anon_vma, folio);
} else {
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ anon_vma = rmap_walk_anon_lock(folio, rwc);
}
if (!anon_vma)
return;
- pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ pgoff_start = folio_pgoff(folio);
+ pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
- unsigned long address = vma_address(page, vma);
+ unsigned long address = vma_address(&folio->page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2327,9 +2270,9 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ if (!rwc->rmap_one(folio, vma, address, rwc->arg))
break;
- if (rwc->done && rwc->done(page))
+ if (rwc->done && rwc->done(folio))
break;
}
@@ -2344,16 +2287,11 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
*
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
- *
- * When called from page_mlock(), the mmap_lock of the mm containing the vma
- * where the page was found will be held for write. So, we won't recheck
- * vm_flags for that VMA. That should be OK, because that vma shouldn't be
- * LOCKED.
*/
-static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
- bool locked)
+static void rmap_walk_file(struct folio *folio,
+ const struct rmap_walk_control *rwc, bool locked)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
@@ -2363,18 +2301,18 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_rwsem.
*/
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (!mapping)
return;
- pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
+ pgoff_start = folio_pgoff(folio);
+ pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
if (!locked)
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
- unsigned long address = vma_address(page, vma);
+ unsigned long address = vma_address(&folio->page, vma);
VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
@@ -2382,9 +2320,9 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma, address, rwc->arg))
+ if (!rwc->rmap_one(folio, vma, address, rwc->arg))
goto done;
- if (rwc->done && rwc->done(page))
+ if (rwc->done && rwc->done(folio))
goto done;
}
@@ -2393,25 +2331,25 @@ done:
i_mmap_unlock_read(mapping);
}
-void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
{
- if (unlikely(PageKsm(page)))
- rmap_walk_ksm(page, rwc);
- else if (PageAnon(page))
- rmap_walk_anon(page, rwc, false);
+ if (unlikely(folio_test_ksm(folio)))
+ rmap_walk_ksm(folio, rwc);
+ else if (folio_test_anon(folio))
+ rmap_walk_anon(folio, rwc, false);
else
- rmap_walk_file(page, rwc, false);
+ rmap_walk_file(folio, rwc, false);
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
{
/* no ksm support for now */
- VM_BUG_ON_PAGE(PageKsm(page), page);
- if (PageAnon(page))
- rmap_walk_anon(page, rwc, true);
+ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
+ if (folio_test_anon(folio))
+ rmap_walk_anon(folio, rwc, true);
else
- rmap_walk_file(page, rwc, true);
+ rmap_walk_file(folio, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -2439,8 +2377,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
__page_set_anon_rmap(page, vma, address, 1);
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 22b310adb53d..098638d3b8a4 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -152,7 +152,7 @@ static void secretmem_freepage(struct page *page)
}
const struct address_space_operations secretmem_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.freepage = secretmem_freepage,
.migratepage = secretmem_migratepage,
.isolate_page = secretmem_isolate_page,
diff --git a/mm/shmem.c b/mm/shmem.c
index a09b29ec2b45..529c9ad3e926 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -476,6 +476,8 @@ bool shmem_is_huge(struct vm_area_struct *vma,
{
loff_t i_size;
+ if (!S_ISREG(inode->i_mode))
+ return false;
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
@@ -1061,6 +1063,12 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
if (shmem_is_huge(NULL, inode, 0))
stat->blksize = HPAGE_PMD_SIZE;
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = info->i_crtime.tv_sec;
+ stat->btime.tv_nsec = info->i_crtime.tv_nsec;
+ }
+
return 0;
}
@@ -1121,6 +1129,7 @@ static void shmem_evict_inode(struct inode *inode)
if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
+ mapping_set_exiting(inode->i_mapping);
shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->shrinklist)) {
spin_lock(&sbinfo->shrinklist_lock);
@@ -1854,9 +1863,6 @@ repeat:
return 0;
}
- /* Never use a huge page for shmem_symlink() */
- if (S_ISLNK(inode->i_mode))
- goto alloc_nohuge;
if (!shmem_is_huge(vma, inode, index))
goto alloc_nohuge;
@@ -2265,6 +2271,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@@ -2357,8 +2364,10 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* don't free the page */
goto out_unacct_blocks;
}
+
+ flush_dcache_page(page);
} else { /* ZEROPAGE */
- clear_highpage(page);
+ clear_user_highpage(page, dst_addr);
}
} else {
page = *pagep;
@@ -2492,19 +2501,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
- enum sgp_type sgp = SGP_READ;
int error = 0;
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
- /*
- * Might this read be for a stacking filesystem? Then when reading
- * holes of a sparse file, we actually need to allocate those pages,
- * and even mark them dirty, so it cannot exceed the max_blocks limit.
- */
- if (!iter_is_iovec(to))
- sgp = SGP_CACHE;
-
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
@@ -2513,6 +2513,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
pgoff_t end_index;
unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
+ bool got_page;
end_index = i_size >> PAGE_SHIFT;
if (index > end_index)
@@ -2523,15 +2524,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp);
+ error = shmem_getpage(inode, index, &page, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
if (page) {
- if (sgp == SGP_CACHE)
- set_page_dirty(page);
unlock_page(page);
if (PageHWPoison(page)) {
@@ -2571,9 +2570,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
if (!offset)
mark_page_accessed(page);
+ got_page = true;
} else {
page = ZERO_PAGE(0);
- get_page(page);
+ got_page = false;
}
/*
@@ -2586,7 +2586,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
- put_page(page);
+ if (got_page)
+ put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
@@ -3196,6 +3197,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
+ .getattr = shmem_getattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3203,6 +3205,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
+ .getattr = shmem_getattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3707,7 +3710,7 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
struct shmem_inode_info *info;
- info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
if (!info)
return NULL;
return &info->vfs_inode;
@@ -3753,7 +3756,7 @@ static int shmem_error_remove_page(struct address_space *mapping,
const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
@@ -3790,6 +3793,7 @@ static const struct inode_operations shmem_inode_operations = {
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
+ .getattr = shmem_getattr,
.create = shmem_create,
.lookup = simple_lookup,
.link = shmem_link,
@@ -3811,6 +3815,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
};
static const struct inode_operations shmem_special_inode_operations = {
+ .getattr = shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
#endif
@@ -3962,8 +3967,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
return count;
}
-struct kobj_attribute shmem_enabled_attr =
- __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
#else /* !CONFIG_SHMEM */
diff --git a/mm/slab.c b/mm/slab.c
index ddf5737c63d9..b04e40078bdf 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3211,7 +3211,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
bool init = false;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+ cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3287,7 +3287,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3295,7 +3296,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned lo
bool init = false;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
+ cachep = slab_pre_alloc_hook(cachep, lru, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3421,6 +3422,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
+ memcg_slab_free_hook(cachep, &objp, 1);
__kfence_free(objp);
return;
}
@@ -3484,6 +3486,18 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
__free_one(ac, objp);
}
+static __always_inline
+void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
+ gfp_t flags)
+{
+ void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
+
+ trace_kmem_cache_alloc(_RET_IP_, ret,
+ cachep->object_size, cachep->size, flags);
+
+ return ret;
+}
+
/**
* kmem_cache_alloc - Allocate an object
* @cachep: The cache to allocate from.
@@ -3496,15 +3510,17 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
-
- trace_kmem_cache_alloc(_RET_IP_, ret,
- cachep->object_size, cachep->size, flags);
-
- return ret;
+ return __kmem_cache_alloc_lru(cachep, NULL, flags);
}
EXPORT_SYMBOL(kmem_cache_alloc);
+void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
+ gfp_t flags)
+{
+ return __kmem_cache_alloc_lru(cachep, lru, flags);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_lru);
+
static __always_inline void
cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
size_t size, void **p, unsigned long caller)
@@ -3521,7 +3537,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
size_t i;
struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, &objcg, size, flags);
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
if (!s)
return 0;
@@ -3562,7 +3578,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;
- ret = slab_alloc(cachep, flags, size, _RET_IP_);
+ ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
@@ -3689,7 +3705,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = slab_alloc(cachep, flags, size, caller);
+ ret = slab_alloc(cachep, NULL, flags, size, caller);
ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
diff --git a/mm/slab.h b/mm/slab.h
index c7f2abc2b154..fd7ae2024897 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -231,6 +231,7 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
+#include <linux/list_lru.h>
/*
* State of the slab allocator.
@@ -472,6 +473,7 @@ static inline size_t obj_full_size(struct kmem_cache *s)
* Returns false if the allocation should fail.
*/
static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t objects, gfp_t flags)
{
@@ -487,13 +489,26 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
if (!objcg)
return true;
- if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
- obj_cgroup_put(objcg);
- return false;
+ if (lru) {
+ int ret;
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(objcg);
+ ret = memcg_list_lru_alloc(memcg, lru, flags);
+ css_put(&memcg->css);
+
+ if (ret)
+ goto out;
}
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s)))
+ goto out;
+
*objcgp = objcg;
return true;
+out:
+ obj_cgroup_put(objcg);
+ return false;
}
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -598,6 +613,7 @@ static inline void memcg_free_slab_cgroups(struct slab *slab)
}
static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t objects, gfp_t flags)
{
@@ -697,6 +713,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ struct list_lru *lru,
struct obj_cgroup **objcgp,
size_t size, gfp_t flags)
{
@@ -707,7 +724,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (should_failslab(s, flags))
return NULL;
- if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags))
+ if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))
return NULL;
return s;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23f2ab0713b7..6ee64d6208b3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -807,7 +807,7 @@ void __init setup_kmalloc_cache_index_table(void)
unsigned int i;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
- (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
+ !is_power_of_2(KMALLOC_MIN_SIZE));
for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
unsigned int elem = size_index_elem(i);
diff --git a/mm/slob.c b/mm/slob.c
index 60c5842215f1..dfa6808dff36 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -635,6 +635,12 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
+
+void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags)
+{
+ return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_lru);
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
@@ -708,7 +714,7 @@ int __kmem_cache_shrink(struct kmem_cache *d)
return 0;
}
-struct kmem_cache kmem_cache_boot = {
+static struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
.size = sizeof(struct kmem_cache),
.flags = SLAB_PANIC,
diff --git a/mm/slub.c b/mm/slub.c
index 261474092e43..74d92aa4a3a2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1788,8 +1788,8 @@ static void *setup_object(struct kmem_cache *s, struct slab *slab,
/*
* Slab allocation and freeing
*/
-static inline struct slab *alloc_slab_page(struct kmem_cache *s,
- gfp_t flags, int node, struct kmem_cache_order_objects oo)
+static inline struct slab *alloc_slab_page(gfp_t flags, int node,
+ struct kmem_cache_order_objects oo)
{
struct folio *folio;
struct slab *slab;
@@ -1941,7 +1941,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
- slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
@@ -1949,7 +1949,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
goto out;
stat(s, ORDER_FALLBACK);
@@ -2348,10 +2348,10 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
- enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
+ enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST };
struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- int lock = 0, free_delta = 0;
- enum slab_modes l = M_NONE, m = M_NONE;
+ int free_delta = 0;
+ enum slab_modes mode = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
unsigned long flags = 0;
@@ -2393,14 +2393,10 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
* Ensure that the slab is unfrozen while the list presence
* reflects the actual number of objects during unfreeze.
*
- * We setup the list membership and then perform a cmpxchg
- * with the count. If there is a mismatch then the slab
- * is not unfrozen but the slab is on the wrong list.
- *
- * Then we restart the process which may have to remove
- * the slab from the list that we just put it on again
- * because the number of objects in the slab may have
- * changed.
+ * We first perform cmpxchg holding lock and insert to list
+ * when it succeed. If there is mismatch then the slab is not
+ * unfrozen and number of objects in the slab may have changed.
+ * Then release lock and retry cmpxchg again.
*/
redo:
@@ -2419,61 +2415,52 @@ redo:
new.frozen = 0;
- if (!new.inuse && n->nr_partial >= s->min_partial)
- m = M_FREE;
- else if (new.freelist) {
- m = M_PARTIAL;
- if (!lock) {
- lock = 1;
- /*
- * Taking the spinlock removes the possibility that
- * acquire_slab() will see a slab that is frozen
- */
- spin_lock_irqsave(&n->list_lock, flags);
- }
+ if (!new.inuse && n->nr_partial >= s->min_partial) {
+ mode = M_FREE;
+ } else if (new.freelist) {
+ mode = M_PARTIAL;
+ /*
+ * Taking the spinlock removes the possibility that
+ * acquire_slab() will see a slab that is frozen
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+ } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) {
+ mode = M_FULL;
+ /*
+ * This also ensures that the scanning of full
+ * slabs from diagnostic functions will not see
+ * any frozen slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
} else {
- m = M_FULL;
- if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) {
- lock = 1;
- /*
- * This also ensures that the scanning of full
- * slabs from diagnostic functions will not see
- * any frozen slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
- }
+ mode = M_FULL_NOLIST;
}
- if (l != m) {
- if (l == M_PARTIAL)
- remove_partial(n, slab);
- else if (l == M_FULL)
- remove_full(s, n, slab);
-
- if (m == M_PARTIAL)
- add_partial(n, slab, tail);
- else if (m == M_FULL)
- add_full(s, n, slab);
- }
- l = m;
if (!cmpxchg_double_slab(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
- "unfreezing slab"))
+ "unfreezing slab")) {
+ if (mode == M_PARTIAL || mode == M_FULL)
+ spin_unlock_irqrestore(&n->list_lock, flags);
goto redo;
+ }
- if (lock)
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (m == M_PARTIAL)
+ if (mode == M_PARTIAL) {
+ add_partial(n, slab, tail);
+ spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, tail);
- else if (m == M_FULL)
- stat(s, DEACTIVATE_FULL);
- else if (m == M_FREE) {
+ } else if (mode == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, slab);
stat(s, FREE_SLAB);
+ } else if (mode == M_FULL) {
+ add_full(s, n, slab);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ stat(s, DEACTIVATE_FULL);
+ } else if (mode == M_FULL_NOLIST) {
+ stat(s, DEACTIVATE_FULL);
}
}
@@ -3131,7 +3118,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
*
* Otherwise we can simply pick the next object from the lockless free list.
*/
-static __always_inline void *slab_alloc_node(struct kmem_cache *s,
+static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
@@ -3141,7 +3128,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct obj_cgroup *objcg = NULL;
bool init = false;
- s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
+ s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags);
if (!s)
return NULL;
@@ -3232,27 +3219,41 @@ out:
return object;
}
-static __always_inline void *slab_alloc(struct kmem_cache *s,
+static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru,
gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
+ return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+static __always_inline
+void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
+ void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
s->size, gfpflags);
return ret;
}
+
+void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ return __kmem_cache_alloc_lru(s, NULL, gfpflags);
+}
EXPORT_SYMBOL(kmem_cache_alloc);
+void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+ gfp_t gfpflags)
+{
+ return __kmem_cache_alloc_lru(s, lru, gfpflags);
+}
+EXPORT_SYMBOL(kmem_cache_alloc_lru);
+
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
+ void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
@@ -3263,7 +3264,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
+ void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -3277,7 +3278,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
+ void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -3667,7 +3668,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
struct obj_cgroup *objcg = NULL;
/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, &objcg, size, flags);
+ s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -4000,15 +4001,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
return 1;
}
-static void set_min_partial(struct kmem_cache *s, unsigned long min)
-{
- if (min < MIN_PARTIAL)
- min = MIN_PARTIAL;
- else if (min > MAX_PARTIAL)
- min = MAX_PARTIAL;
- s->min_partial = min;
-}
-
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -4046,7 +4038,7 @@ static void set_cpu_partial(struct kmem_cache *s)
* calculate_sizes() determines the order and the distribution of data within
* a slab object.
*/
-static int calculate_sizes(struct kmem_cache *s, int forced_order)
+static int calculate_sizes(struct kmem_cache *s)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
@@ -4150,10 +4142,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
size = ALIGN(size, s->align);
s->size = size;
s->reciprocal_size = reciprocal_value(size);
- if (forced_order >= 0)
- order = forced_order;
- else
- order = calculate_order(size);
+ order = calculate_order(size);
if ((int)order < 0)
return 0;
@@ -4189,7 +4178,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
s->random = get_random_long();
#endif
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
if (disable_higher_order_debug) {
/*
@@ -4199,7 +4188,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (get_order(s->size) > get_order(s->object_size)) {
s->flags &= ~DEBUG_METADATA_FLAGS;
s->offset = 0;
- if (!calculate_sizes(s, -1))
+ if (!calculate_sizes(s))
goto error;
}
}
@@ -4215,7 +4204,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
* The larger the object size is, the more slabs we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size) / 2);
+ s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
+ s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
set_cpu_partial(s);
@@ -4417,7 +4407,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, _RET_IP_, size);
+ ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -4465,7 +4455,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
+ ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -4923,7 +4913,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, caller, size);
+ ret = slab_alloc(s, NULL, gfpflags, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4954,7 +4944,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, gfpflags, node, caller, size);
+ ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -5344,12 +5334,10 @@ struct slab_attribute {
};
#define SLAB_ATTR_RO(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0400, _name##_show, NULL)
+ static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400)
#define SLAB_ATTR(_name) \
- static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0600, _name##_show, _name##_store)
+ static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
@@ -5396,7 +5384,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
if (err)
return err;
- set_min_partial(s, min);
+ s->min_partial = min;
return length;
}
SLAB_ATTR(min_partial);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index db6df27c852a..8aecd6b3896c 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,6 +34,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
/**
* struct vmemmap_remap_walk - walk vmemmap page table
*
@@ -53,8 +54,7 @@ struct vmemmap_remap_walk {
struct list_head *vmemmap_pages;
};
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
- struct vmemmap_remap_walk *walk)
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
pmd_t __pmd;
int i;
@@ -76,15 +76,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
set_pte_at(&init_mm, addr, pte, entry);
}
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
-
- flush_tlb_kernel_range(start, start + PMD_SIZE);
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_leaf(*pmd))) {
+ /* Make pte visible before pmd. See comment in pmd_install(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+ } else {
+ pte_free_kernel(&init_mm, pgtable);
+ }
+ spin_unlock(&init_mm.page_table_lock);
return 0;
}
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+ int leaf;
+
+ spin_lock(&init_mm.page_table_lock);
+ leaf = pmd_leaf(*pmd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (!leaf)
+ return 0;
+
+ return __split_vmemmap_huge_pmd(pmd, start);
+}
+
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
@@ -121,13 +140,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
pmd = pmd_offset(pud, addr);
do {
- if (pmd_leaf(*pmd)) {
- int ret;
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ if (ret)
+ return ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
- if (ret)
- return ret;
- }
next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
@@ -245,6 +263,26 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
set_pte_at(&init_mm, addr, pte, entry);
}
+/*
+ * How many struct page structs need to be reset. When we reuse the head
+ * struct page, the special metadata (e.g. page->flags or page->mapping)
+ * cannot copy to the tail struct page structs. The invalid value will be
+ * checked in the free_tail_pages_check(). In order to avoid the message
+ * of "corrupted mapping in tail page". We need to reset at least 3 (one
+ * head struct page struct and two tail struct page structs) struct page
+ * structs.
+ */
+#define NR_RESET_STRUCT_PAGE 3
+
+static inline void reset_struct_pages(struct page *start)
+{
+ int i;
+ struct page *from = start + NR_RESET_STRUCT_PAGE;
+
+ for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
+ memcpy(start + i, from, sizeof(*from));
+}
+
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk)
{
@@ -258,6 +296,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
list_del(&page->lru);
to = page_to_virt(page);
copy_page(to, (void *)walk->reuse_addr);
+ reset_struct_pages(to);
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
@@ -300,10 +339,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end,
*/
BUG_ON(start - reuse != PAGE_SIZE);
- mmap_write_lock(&init_mm);
+ mmap_read_lock(&init_mm);
ret = vmemmap_remap_range(reuse, end, &walk);
- mmap_write_downgrade(&init_mm);
-
if (ret && walk.nr_walked) {
end = reuse + walk.nr_walked * PAGE_SIZE;
/*
@@ -383,6 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
return 0;
}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
/*
* Allocate a block of memory to be used to back the virtual memory map
diff --git a/mm/sparse.c b/mm/sparse.c
index d21c6e5910d0..952f06d8f373 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -126,7 +126,7 @@ static inline int sparse_early_nid(struct mem_section *section)
}
/* Validate the physical addressing limitations of the model */
-void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
+static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
unsigned long *end_pfn)
{
unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
diff --git a/mm/swap.c b/mm/swap.c
index bcf3ac288b56..bceff0cb559c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
};
/*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs. But it gets used by networking.
+ * This path almost never happens for VM activity - pages are normally freed
+ * via pagevecs. But it gets used by networking - and for compound pages.
*/
static void __page_cache_release(struct page *page)
{
@@ -89,7 +89,14 @@ static void __page_cache_release(struct page *page)
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
- __ClearPageWaiters(page);
+ /* See comment on PageMlocked in release_pages() */
+ if (unlikely(PageMlocked(page))) {
+ int nr_pages = thp_nr_pages(page);
+
+ __ClearPageMlocked(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
+ }
}
static void __put_single_page(struct page *page)
@@ -114,17 +121,9 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
- if (is_zone_device_page(page)) {
- put_dev_pagemap(page->pgmap);
-
- /*
- * The page belongs to the device that created pgmap. Do
- * not return it to page allocator.
- */
- return;
- }
-
- if (unlikely(PageCompound(page)))
+ if (unlikely(is_zone_device_page(page)))
+ free_zone_device_page(page);
+ else if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
__put_single_page(page);
@@ -152,7 +151,6 @@ void put_pages_list(struct list_head *pages)
continue;
}
/* Cannot be PageLRU because it's passed to us using the lru */
- __ClearPageWaiters(page);
}
free_unref_page_list(pages);
@@ -425,7 +423,7 @@ void folio_mark_accessed(struct folio *folio)
/*
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
* this list is never rotated or maintained, so marking an
- * evictable page accessed has no effect.
+ * unevictable page accessed has no effect.
*/
} else if (!folio_test_active(folio)) {
/*
@@ -482,22 +480,12 @@ EXPORT_SYMBOL(folio_add_lru);
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
- bool unevictable;
-
VM_BUG_ON_PAGE(PageLRU(page), page);
- unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
- if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
- /*
- * We use the irq-unsafe __mod_zone_page_state because this
- * counter is not modified from interrupt context, and the pte
- * lock is held(spinlock), which implies preemption disabled.
- */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
- count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
- }
- lru_cache_add(page);
+ if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
+ mlock_new_page(page);
+ else
+ lru_cache_add(page);
}
/*
@@ -636,35 +624,37 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
+ mlock_page_drain(cpu);
}
/**
- * deactivate_file_page - forcefully deactivate a file page
- * @page: page to deactivate
+ * deactivate_file_folio() - Forcefully deactivate a file folio.
+ * @folio: Folio to deactivate.
*
- * This function hints the VM that @page is a good reclaim candidate,
- * for example if its invalidation fails due to the page being dirty
+ * This function hints to the VM that @folio is a good reclaim candidate,
+ * for example if its invalidation fails due to the folio being dirty
* or under writeback.
+ *
+ * Context: Caller holds a reference on the page.
*/
-void deactivate_file_page(struct page *page)
+void deactivate_file_folio(struct folio *folio)
{
+ struct pagevec *pvec;
+
/*
- * In a workload with many unevictable page such as mprotect,
- * unevictable page deactivation for accelerating reclaim is pointless.
+ * In a workload with many unevictable pages such as mprotect,
+ * unevictable folio deactivation for accelerating reclaim is pointless.
*/
- if (PageUnevictable(page))
+ if (folio_test_unevictable(folio))
return;
- if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
+ folio_get(folio);
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
- if (pagevec_add_and_need_flush(pvec, page))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
- local_unlock(&lru_pvecs.lock);
- }
+ if (pagevec_add_and_need_flush(pvec, &folio->page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
+ local_unlock(&lru_pvecs.lock);
}
/*
@@ -831,13 +821,13 @@ inline void __lru_add_drain_all(bool force_all_cpus)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (force_all_cpus ||
- pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu) ||
+ need_mlock_page_drain(cpu) ||
has_bh_in_lru(cpu, NULL)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
@@ -876,15 +866,21 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
void lru_cache_disable(void)
{
atomic_inc(&lru_disable_count);
-#ifdef CONFIG_SMP
/*
- * lru_add_drain_all in the force mode will schedule draining on
- * all online CPUs so any calls of lru_cache_disabled wrapped by
- * local_lock or preemption disabled would be ordered by that.
- * The atomic operation doesn't need to have stronger ordering
- * requirements because that is enforced by the scheduling
- * guarantees.
+ * Readers of lru_disable_count are protected by either disabling
+ * preemption or rcu_read_lock:
+ *
+ * preempt_disable, local_irq_disable [bh_lru_lock()]
+ * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT]
+ * preempt_disable [local_lock !CONFIG_PREEMPT_RT]
+ *
+ * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
+ * preempt_disable() regions of code. So any CPU which sees
+ * lru_disable_count = 0 will have exited the critical
+ * section when synchronize_rcu() returns.
*/
+ synchronize_rcu();
+#ifdef CONFIG_SMP
__lru_add_drain_all(true);
#else
lru_add_and_bh_lrus_drain();
@@ -930,18 +926,10 @@ void release_pages(struct page **pages, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- /*
- * ZONE_DEVICE pages that return 'false' from
- * page_is_devmap_managed() do not require special
- * processing, and instead, expect a call to
- * put_page_testzero().
- */
- if (page_is_devmap_managed(page)) {
- put_devmap_managed_page(page);
+ if (put_devmap_managed_page(page))
continue;
- }
if (put_page_testzero(page))
- put_dev_pagemap(page->pgmap);
+ free_zone_device_page(page);
continue;
}
@@ -969,7 +957,17 @@ void release_pages(struct page **pages, int nr)
__clear_page_lru_flags(page);
}
- __ClearPageWaiters(page);
+ /*
+ * In rare cases, when truncation or holepunching raced with
+ * munlock after VM_LOCKED was cleared, Mlocked may still be
+ * found set here. This does not indicate a problem, unless
+ * "unevictable_pgs_cleared" appears worryingly large.
+ */
+ if (unlikely(PageMlocked(page))) {
+ __ClearPageMlocked(page);
+ dec_zone_page_state(page, NR_MLOCK);
+ count_vm_event(UNEVICTABLE_PGCLEARED);
+ }
list_add(&page->lru, &pages_to_free);
}
@@ -1009,43 +1007,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ folio_set_lru(folio);
/*
- * A folio becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
- * 2) Before acquiring LRU lock to put the folio on the correct LRU
- * and then
- * a) do PageLRU check with lock [check_move_unevictable_pages]
- * b) do PageLRU check before lock [clear_page_mlock]
- *
- * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
- * following strict ordering:
- *
- * #0: __pagevec_lru_add_fn #1: clear_page_mlock
- *
- * folio_set_lru() folio_test_clear_mlocked()
- * smp_mb() // explicit ordering // above provides strict
- * // ordering
- * folio_test_mlocked() folio_test_lru()
- *
+ * Is an smp_mb__after_atomic() still required here, before
+ * folio_evictable() tests PageMlocked, to rule out the possibility
+ * of stranding an evictable folio on an unevictable LRU? I think
+ * not, because __munlock_page() only clears PageMlocked while the LRU
+ * lock is held.
*
- * if '#1' does not observe setting of PG_lru by '#0' and
- * fails isolation, the explicit barrier will make sure that
- * folio_evictable check will put the folio on the correct
- * LRU. Without smp_mb(), folio_set_lru() can be reordered
- * after folio_test_mlocked() check and can make '#1' fail the
- * isolation of the folio whose mlocked bit is cleared (#0 is
- * also looking at the same folio) and the evictable folio will
- * be stranded on an unevictable LRU.
+ * (That is not true of __page_cache_release(), and not necessarily
+ * true of release_pages(): but those only clear PageMlocked after
+ * put_page_testzero() has excluded any other users of the page.)
*/
- folio_set_lru(folio);
- smp_mb__after_atomic();
-
if (folio_evictable(folio)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
folio_clear_active(folio);
folio_set_unevictable(folio);
+ /*
+ * folio->mlock_count = !!folio_test_mlocked(folio)?
+ * But that leaves __mlock_page() in doubt whether another
+ * actor has already counted the mlock or not. Err on the
+ * safe side, underestimate, let page reclaim fix it, rather
+ * than leaving a page on the unevictable LRU indefinitely.
+ */
+ folio->mlock_count = 0;
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
@@ -1153,26 +1140,3 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
-
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-void put_devmap_managed_page(struct page *page)
-{
- int count;
-
- if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
- return;
-
- count = page_ref_dec_return(page);
-
- /*
- * devmap page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (count == 1)
- free_devmap_managed_page(page);
- else if (!count)
- __put_page(page);
-}
-EXPORT_SYMBOL(put_devmap_managed_page);
-#endif
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 7f34343c075a..5a9442979a18 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -167,14 +167,12 @@ unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
int swap_cgroup_swapon(int type, unsigned long max_pages)
{
void *array;
- unsigned long array_size;
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
- array_size = length * sizeof(void *);
- array = vzalloc(array_size);
+ array = vcalloc(length, sizeof(void *));
if (!array)
goto nomem;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d4104242100..013856004825 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -30,7 +30,7 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .set_page_dirty = swap_set_page_dirty,
+ .dirty_folio = swap_dirty_folio,
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
@@ -478,7 +478,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
* in swap_map, but not yet added its page to swap cache.
*/
- cond_resched();
+ schedule_timeout_uninterruptible(1);
}
/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bf0df7aa7158..63c61f8b2611 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1167,16 +1167,6 @@ out:
return NULL;
}
-static struct swap_info_struct *swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = _swap_info_get(entry);
- if (p)
- spin_lock(&p->lock);
- return p;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1601,100 +1591,6 @@ static bool page_swapped(struct page *page)
return false;
}
-static int page_trans_huge_map_swapcount(struct page *page,
- int *total_swapcount)
-{
- int i, map_swapcount, _total_swapcount;
- unsigned long offset = 0;
- struct swap_info_struct *si;
- struct swap_cluster_info *ci = NULL;
- unsigned char *map = NULL;
- int swapcount = 0;
-
- /* hugetlbfs shouldn't call it */
- VM_BUG_ON_PAGE(PageHuge(page), page);
-
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- if (PageSwapCache(page))
- swapcount = page_swapcount(page);
- if (total_swapcount)
- *total_swapcount = swapcount;
- return swapcount + page_trans_huge_mapcount(page);
- }
-
- page = compound_head(page);
-
- _total_swapcount = map_swapcount = 0;
- if (PageSwapCache(page)) {
- swp_entry_t entry;
-
- entry.val = page_private(page);
- si = _swap_info_get(entry);
- if (si) {
- map = si->swap_map;
- offset = swp_offset(entry);
- }
- }
- if (map)
- ci = lock_cluster(si, offset);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- int mapcount = atomic_read(&page[i]._mapcount) + 1;
- if (map) {
- swapcount = swap_count(map[offset + i]);
- _total_swapcount += swapcount;
- }
- map_swapcount = max(map_swapcount, mapcount + swapcount);
- }
- unlock_cluster(ci);
-
- if (PageDoubleMap(page))
- map_swapcount -= 1;
-
- if (total_swapcount)
- *total_swapcount = _total_swapcount;
-
- return map_swapcount + compound_mapcount(page);
-}
-
-/*
- * We can write to an anon page without COW if there are no other references
- * to it. And as a side-effect, free up its swap: because the old content
- * on disk will never be read, and seeking back there to write new content
- * later would only waste time away from clustering.
- */
-bool reuse_swap_page(struct page *page)
-{
- int count, total_swapcount;
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (unlikely(PageKsm(page)))
- return false;
- count = page_trans_huge_map_swapcount(page, &total_swapcount);
- if (count == 1 && PageSwapCache(page) &&
- (likely(!PageTransCompound(page)) ||
- /* The remaining swap count will be freed soon */
- total_swapcount == page_swapcount(page))) {
- if (!PageWriteback(page)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- } else {
- swp_entry_t entry;
- struct swap_info_struct *p;
-
- entry.val = page_private(page);
- p = swap_info_get(entry);
- if (p->flags & SWP_STABLE_WRITES) {
- spin_unlock(&p->lock);
- return false;
- }
- spin_unlock(&p->lock);
- }
- }
-
- return count <= 1;
-}
-
/*
* If swap is getting full, or if there are no more mappings of this page,
* then try_to_free_swap is called to free its swap space.
@@ -1951,6 +1847,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
struct vm_fault vmf = {
.vma = vma,
.address = addr,
+ .real_address = addr,
.pmd = pmd,
};
diff --git a/mm/truncate.c b/mm/truncate.c
index 9dbf0b75da5d..ab50d0d59a2a 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,8 +19,7 @@
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/buffer_head.h> /* grr. try_to_release_page,
- do_invalidatepage */
+#include <linux/buffer_head.h> /* grr. try_to_release_page */
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -138,33 +137,28 @@ static int invalidate_exceptional_entry2(struct address_space *mapping,
}
/**
- * do_invalidatepage - invalidate part or all of a page
- * @page: the page which is affected
+ * folio_invalidate - Invalidate part or all of a folio.
+ * @folio: The folio which is affected.
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
- * do_invalidatepage() is called when all or part of the page has become
+ * folio_invalidate() is called when all or part of the folio has become
* invalidated by a truncate operation.
*
- * do_invalidatepage() does not have to release all buffers, but it must
+ * folio_invalidate() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void do_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
- void (*invalidatepage)(struct page *, unsigned int, unsigned int);
-
- invalidatepage = page->mapping->a_ops->invalidatepage;
-#ifdef CONFIG_BLOCK
- if (!invalidatepage)
- invalidatepage = block_invalidatepage;
-#endif
- if (invalidatepage)
- (*invalidatepage)(page, offset, length);
+ const struct address_space_operations *aops = folio->mapping->a_ops;
+
+ if (aops->invalidate_folio)
+ aops->invalidate_folio(folio, offset, length);
}
+EXPORT_SYMBOL_GPL(folio_invalidate);
/*
* If truncate cannot remove the fs-private metadata from the page, the page
@@ -182,7 +176,7 @@ static void truncate_cleanup_folio(struct folio *folio)
unmap_mapping_folio(folio);
if (folio_has_private(folio))
- do_invalidatepage(&folio->page, 0, folio_size(folio));
+ folio_invalidate(folio, 0, folio_size(folio));
/*
* Some filesystems seem to re-dirty the page even after
@@ -193,27 +187,6 @@ static void truncate_cleanup_folio(struct folio *folio)
folio_clear_mappedtodisk(folio);
}
-/*
- * This is for invalidate_mapping_pages(). That function can be called at
- * any time, and is not supposed to throw away dirty pages. But pages can
- * be marked dirty at any time too, so use remove_mapping which safely
- * discards clean, unused pages.
- *
- * Returns non-zero if the page was successfully invalidated.
- */
-static int
-invalidate_complete_page(struct address_space *mapping, struct page *page)
-{
-
- if (page->mapping != mapping)
- return 0;
-
- if (page_has_private(page) && !try_to_release_page(page, 0))
- return 0;
-
- return remove_mapping(mapping, page);
-}
-
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
if (folio->mapping != mapping)
@@ -264,7 +237,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_zero_range(folio, offset, length);
if (folio_has_private(folio))
- do_invalidatepage(&folio->page, offset, length);
+ folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
if (split_huge_page(&folio->page) == 0)
@@ -294,22 +267,40 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
}
EXPORT_SYMBOL(generic_error_remove_page);
-/*
+static long mapping_evict_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ return 0;
+ /* The refcount will be elevated if any page in the folio is mapped */
+ if (folio_ref_count(folio) >
+ folio_nr_pages(folio) + folio_has_private(folio) + 1)
+ return 0;
+ if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
+ return 0;
+
+ return remove_mapping(mapping, folio);
+}
+
+/**
+ * invalidate_inode_page() - Remove an unused page from the pagecache.
+ * @page: The page to remove.
+ *
* Safely invalidate one page from its pagecache mapping.
- * It only drops clean, unused pages. The page must be locked.
+ * It only drops clean, unused pages.
*
- * Returns 1 if the page is successfully invalidated, otherwise 0.
+ * Context: Page must be locked.
+ * Return: The number of pages successfully removed.
*/
-int invalidate_inode_page(struct page *page)
+long invalidate_inode_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
+ struct folio *folio = page_folio(page);
+ struct address_space *mapping = folio_mapping(folio);
+
+ /* The page may have been truncated before it was locked */
if (!mapping)
return 0;
- if (PageDirty(page) || PageWriteback(page))
- return 0;
- if (page_mapped(page))
- return 0;
- return invalidate_complete_page(mapping, page);
+ return mapping_evict_folio(mapping, folio);
}
/**
@@ -332,7 +323,7 @@ int invalidate_inode_page(struct page *page)
* mapping is large, it is probably the case that the final pages are the most
* recently touched, and freeing happens in ascending file offset order.
*
- * Note that since ->invalidatepage() accepts range to invalidate
+ * Note that since ->invalidate_folio() accepts range to invalidate
* truncate_inode_pages_range is able to handle cases where lend + 1 is not
* page aligned properly.
*/
@@ -497,7 +488,18 @@ void truncate_inode_pages_final(struct address_space *mapping)
}
EXPORT_SYMBOL(truncate_inode_pages_final);
-static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+/**
+ * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ * @nr_pagevec: invalidate failed page number for caller
+ *
+ * This helper is similar to invalidate_mapping_pages(), except that it accounts
+ * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
+ * will be used by the caller.
+ */
+unsigned long invalidate_mapping_pagevec(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
@@ -510,27 +512,27 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
folio_batch_init(&fbatch);
while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
count += invalidate_exceptional_entry(mapping,
index,
- page);
+ folio);
continue;
}
- index += thp_nr_pages(page) - 1;
+ index += folio_nr_pages(folio) - 1;
- ret = invalidate_inode_page(page);
- unlock_page(page);
+ ret = mapping_evict_folio(mapping, folio);
+ folio_unlock(folio);
/*
- * Invalidation is a hint that the page is no longer
+ * Invalidation is a hint that the folio is no longer
* of interest and try to speed up its reclaim.
*/
if (!ret) {
- deactivate_file_page(page);
+ deactivate_file_folio(folio);
/* It is likely on the pagevec of a remote CPU */
if (nr_pagevec)
(*nr_pagevec)++;
@@ -562,29 +564,12 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- return __invalidate_mapping_pages(mapping, start, end, NULL);
+ return invalidate_mapping_pagevec(mapping, start, end, NULL);
}
EXPORT_SYMBOL(invalidate_mapping_pages);
-/**
- * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- * @nr_pagevec: invalidate failed page number for caller
- *
- * This helper is similar to invalidate_mapping_pages(), except that it accounts
- * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
- * will be used by the caller.
- */
-void invalidate_mapping_pagevec(struct address_space *mapping,
- pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
-{
- __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
-}
-
/*
- * This is like invalidate_complete_page(), except it ignores the page's
+ * This is like invalidate_inode_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
* invalidation guarantees, and cannot afford to leave pages behind because
* shrink_page_list() has a temp ref on them, or because they're transiently
@@ -620,13 +605,13 @@ failed:
return 0;
}
-static int do_launder_folio(struct address_space *mapping, struct folio *folio)
+static int folio_launder(struct address_space *mapping, struct folio *folio)
{
if (!folio_test_dirty(folio))
return 0;
- if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL)
return 0;
- return mapping->a_ops->launder_page(&folio->page);
+ return mapping->a_ops->launder_folio(folio);
}
/**
@@ -692,7 +677,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
unmap_mapping_folio(folio);
BUG_ON(folio_mapped(folio));
- ret2 = do_launder_folio(mapping, folio);
+ ret2 = folio_launder(mapping, folio);
if (ret2 == 0) {
if (!invalidate_complete_folio2(mapping, folio))
ret2 = -EBUSY;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index d0d268135d96..2c235d5c2364 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -29,7 +29,7 @@
* Returns:
* NOT_STACK: not at all on the stack
* GOOD_FRAME: fully within a valid stack frame
- * GOOD_STACK: fully on the stack (when can't do frame-checking)
+ * GOOD_STACK: within the current stack (when can't frame-check exactly)
* BAD_STACK: error condition (invalid stack position or bad stack frame)
*/
static noinline int check_stack_object(const void *obj, unsigned long len)
@@ -55,6 +55,17 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
if (ret)
return ret;
+ /* Finally, check stack depth if possible. */
+#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
+ if (IS_ENABLED(CONFIG_STACK_GROWSUP)) {
+ if ((void *)current_stack_pointer < obj + len)
+ return BAD_STACK;
+ } else {
+ if (obj < (void *)current_stack_pointer)
+ return BAD_STACK;
+ }
+#endif
+
return GOOD_STACK;
}
@@ -70,17 +81,6 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
* kmem_cache_create_usercopy() function to create the cache (and
* carefully audit the whitelist range).
*/
-void usercopy_warn(const char *name, const char *detail, bool to_user,
- unsigned long offset, unsigned long len)
-{
- WARN_ONCE(1, "Bad or missing usercopy whitelist? Kernel memory %s attempt detected %s %s%s%s%s (offset %lu, size %lu)!\n",
- to_user ? "exposure" : "overwrite",
- to_user ? "from" : "to",
- name ? : "unknown?!",
- detail ? " '" : "", detail ? : "", detail ? "'" : "",
- offset, len);
-}
-
void __noreturn usercopy_abort(const char *name, const char *detail,
bool to_user, unsigned long offset,
unsigned long len)
@@ -280,7 +280,15 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
*/
return;
default:
- usercopy_abort("process stack", NULL, to_user, 0, n);
+ usercopy_abort("process stack", NULL, to_user,
+#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER
+ IS_ENABLED(CONFIG_STACK_GROWSUP) ?
+ ptr - (void *)current_stack_pointer :
+ (void *)current_stack_pointer - ptr,
+#else
+ 0,
+#endif
+ n);
}
/* Check for bad heap object. */
@@ -295,7 +303,10 @@ static bool enable_checks __initdata = true;
static int __init parse_hardened_usercopy(char *str)
{
- return strtobool(str, &enable_checks);
+ if (strtobool(str, &enable_checks))
+ pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
+ str);
+ return 1;
}
__setup("hardened_usercopy=", parse_hardened_usercopy);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 0780c2a57ff1..0cb8e5ef1713 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -95,10 +95,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (!pte_none(*dst_pte))
goto out_unlock;
- if (page_in_cache)
- page_add_file_rmap(page, false);
- else
+ if (page_in_cache) {
+ /* Usually, cache pages are already added to LRU */
+ if (newly_allocated)
+ lru_cache_add(page);
+ page_add_file_rmap(page, dst_vma, false);
+ } else {
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+ }
/*
* Must happen after rmap, as mm_counter() checks mapping (via
@@ -106,9 +111,6 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
*/
inc_mm_counter(dst_mm, mm_counter(page));
- if (newly_allocated)
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
@@ -150,6 +152,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
/* don't free the page */
goto out;
}
+
+ flush_dcache_page(page);
} else {
page = *pagep;
*pagep = NULL;
@@ -625,6 +629,7 @@ retry:
err = -EFAULT;
goto out;
}
+ flush_dcache_page(page);
goto retry;
} else
BUG_ON(page);
diff --git a/mm/util.c b/mm/util.c
index 7e43369064c8..54e5e761a9a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -587,8 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
return ret;
/* Don't even allow crazy sizes */
- if (WARN_ON_ONCE(size > INT_MAX))
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
+ }
return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
@@ -647,6 +649,56 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
}
EXPORT_SYMBOL(kvrealloc);
+/**
+ * __vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+ size_t bytes;
+
+ if (unlikely(check_mul_overflow(n, size, &bytes)))
+ return NULL;
+ return __vmalloc(bytes, flags);
+}
+EXPORT_SYMBOL(__vmalloc_array);
+
+/**
+ * vmalloc_array - allocate memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vmalloc_array(size_t n, size_t size)
+{
+ return __vmalloc_array(n, size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(vmalloc_array);
+
+/**
+ * __vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
+ */
+void *__vcalloc(size_t n, size_t size, gfp_t flags)
+{
+ return __vmalloc_array(n, size, flags | __GFP_ZERO);
+}
+EXPORT_SYMBOL(__vcalloc);
+
+/**
+ * vcalloc - allocate and zero memory for a virtually contiguous array.
+ * @n: number of elements.
+ * @size: element size.
+ */
+void *vcalloc(size_t n, size_t size)
+{
+ return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vcalloc);
+
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
@@ -679,9 +731,8 @@ bool folio_mapped(struct folio *folio)
}
EXPORT_SYMBOL(folio_mapped);
-struct anon_vma *page_anon_vma(struct page *page)
+struct anon_vma *folio_anon_vma(struct folio *folio)
{
- struct folio *folio = page_folio(page);
unsigned long mapping = (unsigned long)folio->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
@@ -741,6 +792,39 @@ int __page_mapcount(struct page *page)
EXPORT_SYMBOL_GPL(__page_mapcount);
/**
+ * folio_mapcount() - Calculate the number of mappings of this folio.
+ * @folio: The folio.
+ *
+ * A large folio tracks both how many times the entire folio is mapped,
+ * and how many times each individual page in the folio is mapped.
+ * This function calculates the total number of times the folio is
+ * mapped.
+ *
+ * Return: The number of times this folio is mapped.
+ */
+int folio_mapcount(struct folio *folio)
+{
+ int i, compound, nr, ret;
+
+ if (likely(!folio_test_large(folio)))
+ return atomic_read(&folio->_mapcount) + 1;
+
+ compound = folio_entire_mapcount(folio);
+ nr = folio_nr_pages(folio);
+ if (folio_test_hugetlb(folio))
+ return compound;
+ ret = compound;
+ for (i = 0; i < nr; i++)
+ ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
+ /* File pages has compound_mapcount included in _mapcount */
+ if (!folio_test_anon(folio))
+ return ret - compound * nr;
+ if (folio_test_double_map(folio))
+ ret -= nr;
+ return ret;
+}
+
+/**
* folio_copy - Copy the contents of one folio to another.
* @dst: Folio to copy to.
* @src: Folio to copy from.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 4165304d3547..e163372d3967 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -74,7 +74,7 @@ static const bool vmap_allow_huge = false;
bool is_vmalloc_addr(const void *x)
{
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
return addr >= VMALLOC_START && addr < VMALLOC_END;
}
@@ -118,7 +118,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (size != PAGE_SIZE) {
pte_t entry = pfn_pte(pfn, prot);
- entry = pte_mkhuge(entry);
entry = arch_make_huge_pte(entry, ilog2(size), 0);
set_huge_pte_at(&init_mm, addr, pte, entry);
pfn += PFN_DOWN(size);
@@ -632,7 +631,7 @@ int is_vmalloc_or_module_addr(const void *x)
* just put it in the vmalloc space.
*/
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
- unsigned long addr = (unsigned long)x;
+ unsigned long addr = (unsigned long)kasan_reset_tag(x);
if (addr >= MODULES_VADDR && addr < MODULES_END)
return 1;
#endif
@@ -776,23 +775,13 @@ get_subtree_max_size(struct rb_node *node)
return va ? va->subtree_max_size : 0;
}
-/*
- * Gets called when remove the node and rotate.
- */
-static __always_inline unsigned long
-compute_subtree_max_size(struct vmap_area *va)
-{
- return max3(va_size(va),
- get_subtree_max_size(va->rb_node.rb_left),
- get_subtree_max_size(va->rb_node.rb_right));
-}
-
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
-static unsigned long lazy_max_pages(void);
+static void drain_vmap_area_work(struct work_struct *work);
+static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);
static atomic_long_t nr_vmalloc_pages;
@@ -806,6 +795,8 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
struct vmap_area *va = NULL;
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *tmp;
@@ -827,6 +818,8 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
{
struct rb_node *n = vmap_area_root.rb_node;
+ addr = (unsigned long)kasan_reset_tag((void *)addr);
+
while (n) {
struct vmap_area *va;
@@ -973,6 +966,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
}
#if DEBUG_AUGMENT_PROPAGATE_CHECK
+/*
+ * Gets called when remove the node and rotate.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
static void
augment_tree_propagate_check(void)
{
@@ -1189,22 +1193,28 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
/*
* Find the first free block(lowest start address) in the tree,
* that will accomplish the request corresponding to passing
- * parameters.
+ * parameters. Please note, with an alignment bigger than PAGE_SIZE,
+ * a search length is adjusted to account for worst case alignment
+ * overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size,
- unsigned long align, unsigned long vstart)
+find_vmap_lowest_match(unsigned long size, unsigned long align,
+ unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
+ unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
+ /* Adjust the search size for alignment overhead. */
+ length = adjust_search_size ? size + align - 1 : size;
+
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
- if (get_subtree_max_size(node->rb_left) >= size &&
+ if (get_subtree_max_size(node->rb_left) >= length &&
vstart < va->va_start) {
node = node->rb_left;
} else {
@@ -1214,9 +1224,9 @@ find_vmap_lowest_match(unsigned long size,
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
- * equal or bigger to the requested search size.
+ * equal or bigger to the requested search length.
*/
- if (get_subtree_max_size(node->rb_right) >= size) {
+ if (get_subtree_max_size(node->rb_right) >= length) {
node = node->rb_right;
continue;
}
@@ -1232,7 +1242,7 @@ find_vmap_lowest_match(unsigned long size,
if (is_within_this_va(va, size, align, vstart))
return va;
- if (get_subtree_max_size(node->rb_right) >= size &&
+ if (get_subtree_max_size(node->rb_right) >= length &&
vstart <= va->va_start) {
/*
* Shift the vstart forward. Please note, we update it with
@@ -1280,7 +1290,7 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(size, align, vstart, false);
va_2 = find_vmap_lowest_linear_match(size, align, vstart);
if (va_1 != va_2)
@@ -1431,12 +1441,25 @@ static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
+ bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
enum fit_type type;
int ret;
- va = find_vmap_lowest_match(size, align, vstart);
+ /*
+ * Do not adjust when:
+ * a) align <= PAGE_SIZE, because it does not make any sense.
+ * All blocks(their start addresses) are at least PAGE_SIZE
+ * aligned anyway;
+ * b) a short range where a requested size corresponds to exactly
+ * specified [vstart:vend] interval and an alignment > PAGE_SIZE.
+ * With adjusted search length an allocation would not succeed.
+ */
+ if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
+ adjust_search_size = false;
+
+ va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1720,18 +1743,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
}
/*
- * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
- * is already purging.
- */
-static void try_purge_vmap_area_lazy(void)
-{
- if (mutex_trylock(&vmap_purge_lock)) {
- __purge_vmap_area_lazy(ULONG_MAX, 0);
- mutex_unlock(&vmap_purge_lock);
- }
-}
-
-/*
* Kick off a purge of the outstanding lazy areas.
*/
static void purge_vmap_area_lazy(void)
@@ -1742,6 +1753,20 @@ static void purge_vmap_area_lazy(void)
mutex_unlock(&vmap_purge_lock);
}
+static void drain_vmap_area_work(struct work_struct *work)
+{
+ unsigned long nr_lazy;
+
+ do {
+ mutex_lock(&vmap_purge_lock);
+ __purge_vmap_area_lazy(ULONG_MAX, 0);
+ mutex_unlock(&vmap_purge_lock);
+
+ /* Recheck if further work is required. */
+ nr_lazy = atomic_long_read(&vmap_lazy_nr);
+ } while (nr_lazy > lazy_max_pages());
+}
+
/*
* Free a vmap area, caller ensuring that the area has been unmapped
* and flush_cache_vunmap had been called for the correct range
@@ -1768,7 +1793,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
/* After this point, we may free va at any time */
if (unlikely(nr_lazy > lazy_max_pages()))
- try_purge_vmap_area_lazy();
+ schedule_work(&drain_vmap_work);
}
/*
@@ -2145,7 +2170,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
void vm_unmap_ram(const void *mem, unsigned int count)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
- unsigned long addr = (unsigned long)mem;
+ unsigned long addr = (unsigned long)kasan_reset_tag(mem);
struct vmap_area *va;
might_sleep();
@@ -2206,14 +2231,19 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
mem = (void *)addr;
}
- kasan_unpoison_vmalloc(mem, size);
-
if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
pages, PAGE_SHIFT) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+ /*
+ * Mark the pages as accessible, now that they are mapped.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
+
return mem;
}
EXPORT_SYMBOL(vm_map_ram);
@@ -2439,10 +2469,20 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
-
setup_vmalloc_vm(area, va, flags, caller);
+ /*
+ * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
+ * best-effort approach, as they can be mapped outside of vmalloc code.
+ * For VM_ALLOC mappings, the pages are marked as accessible after
+ * getting mapped in __vmalloc_node_range().
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ if (!(flags & VM_ALLOC))
+ area->addr = kasan_unpoison_vmalloc(area->addr, requested_size,
+ KASAN_VMALLOC_PROT_NORMAL);
+
return area;
}
@@ -2526,7 +2566,7 @@ struct vm_struct *remove_vm_area(const void *addr)
va->vm = NULL;
spin_unlock(&vmap_area_lock);
- kasan_free_shadow(vm);
+ kasan_free_module_shadow(vm);
free_unmap_vmap_area(va);
return vm;
@@ -2925,7 +2965,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t orig_gfp_mask = gfp_mask;
bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
@@ -2949,7 +2988,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!area->pages) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@@ -2959,8 +2998,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
- area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
- page_order, nr_small_pages, area->pages);
+ area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+ node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
@@ -2976,7 +3015,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
@@ -3004,7 +3043,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
memalloc_noio_restore(flags);
if (ret < 0) {
- warn_alloc(orig_gfp_mask, NULL,
+ warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
@@ -3051,7 +3090,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller)
{
struct vm_struct *area;
- void *addr;
+ void *ret;
+ kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
unsigned long real_size = size;
unsigned long real_align = align;
unsigned int shift = PAGE_SHIFT;
@@ -3104,11 +3144,51 @@ again:
goto fail;
}
- addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
- if (!addr)
+ /*
+ * Prepare arguments for __vmalloc_area_node() and
+ * kasan_unpoison_vmalloc().
+ */
+ if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
+ if (kasan_hw_tags_enabled()) {
+ /*
+ * Modify protection bits to allow tagging.
+ * This must be done before mapping.
+ */
+ prot = arch_vmap_pgprot_tagged(prot);
+
+ /*
+ * Skip page_alloc poisoning and zeroing for physical
+ * pages backing VM_ALLOC mapping. Memory is instead
+ * poisoned and zeroed by kasan_unpoison_vmalloc().
+ */
+ gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO;
+ }
+
+ /* Take note that the mapping is PAGE_KERNEL. */
+ kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
+ }
+
+ /* Allocate physical pages and map them into vmalloc space. */
+ ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
+ if (!ret)
goto fail;
/*
+ * Mark the pages as accessible, now that they are mapped.
+ * The init condition should match the one in post_alloc_hook()
+ * (except for the should_skip_init() check) to make sure that memory
+ * is initialized under the same conditions regardless of the enabled
+ * KASAN mode.
+ * Tag-based KASAN modes only assign tags to normal non-executable
+ * allocations, see __kasan_unpoison_vmalloc().
+ */
+ kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+ kasan_flags |= KASAN_VMALLOC_INIT;
+ /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
+ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
@@ -3119,7 +3199,7 @@ again:
if (!(vm_flags & VM_DEFER_KMEMLEAK))
kmemleak_vmalloc(area, size, gfp_mask);
- return addr;
+ return area->addr;
fail:
if (shift > PAGE_SHIFT) {
@@ -3404,6 +3484,8 @@ long vread(char *buf, char *addr, unsigned long count)
unsigned long buflen = count;
unsigned long n;
+ addr = kasan_reset_tag(addr);
+
/* Don't allow overflow */
if ((unsigned long) addr + count < count)
count = -(unsigned long) addr;
@@ -3789,9 +3871,6 @@ retry:
for (area = 0; area < nr_vms; area++) {
if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
goto err_free_shadow;
-
- kasan_unpoison_vmalloc((void *)vas[area]->va_start,
- sizes[area]);
}
/* insert all vm's */
@@ -3804,6 +3883,16 @@ retry:
}
spin_unlock(&vmap_area_lock);
+ /*
+ * Mark allocated areas as accessible. Do it now as a best-effort
+ * approach, as they can be mapped outside of vmalloc code.
+ * With hardware tag-based KASAN, marking is skipped for
+ * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
+ */
+ for (area = 0; area < nr_vms; area++)
+ vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
+ vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+
kfree(vas);
return vms;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 090bfb605ecf..1678802e03e7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -56,6 +56,7 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -978,47 +979,36 @@ void drop_slab(void)
drop_slab_node(nid);
}
-static inline int is_page_cache_freeable(struct page *page)
+static inline int is_page_cache_freeable(struct folio *folio)
{
/*
* A freeable page cache page is referenced only by the caller
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = thp_nr_pages(page);
- return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
-}
-
-static int may_write_to_inode(struct inode *inode)
-{
- if (current->flags & PF_SWAPWRITE)
- return 1;
- if (!inode_write_congested(inode))
- return 1;
- if (inode_to_bdi(inode) == current->backing_dev_info)
- return 1;
- return 0;
+ return folio_ref_count(folio) - folio_test_private(folio) ==
+ 1 + folio_nr_pages(folio);
}
/*
- * We detected a synchronous write error writing a page out. Probably
+ * We detected a synchronous write error writing a folio out. Probably
* -ENOSPC. We need to propagate that into the address_space for a subsequent
* fsync(), msync() or close().
*
* The tricky part is that after writepage we cannot touch the mapping: nothing
- * prevents it from being freed up. But we have a ref on the page and once
- * that page is locked, the mapping is pinned.
+ * prevents it from being freed up. But we have a ref on the folio and once
+ * that folio is locked, the mapping is pinned.
*
- * We're allowed to run sleeping lock_page() here because we know the caller has
+ * We're allowed to run sleeping folio_lock() here because we know the caller has
* __GFP_FS.
*/
static void handle_write_error(struct address_space *mapping,
- struct page *page, int error)
+ struct folio *folio, int error)
{
- lock_page(page);
- if (page_mapping(page) == mapping)
+ folio_lock(folio);
+ if (folio_mapping(folio) == mapping)
mapping_set_error(mapping, error);
- unlock_page(page);
+ folio_unlock(folio);
}
static bool skip_throttle_noprogress(pg_data_t *pgdat)
@@ -1066,8 +1056,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
- current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
+ cond_resched();
return;
+ }
/*
* These figures are pulled out of thin air.
@@ -1163,35 +1155,35 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct folio *folio, struct address_space *mapping)
{
/*
- * If the page is dirty, only perform writeback if that write
+ * If the folio is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
* stalled by pagecache activity. But note that there may be
* stalls if we need to run get_block(). We could test
* PagePrivate for that.
*
* If this process is currently in __generic_file_write_iter() against
- * this page's queue, we can perform writeback even if that
+ * this folio's queue, we can perform writeback even if that
* will block.
*
- * If the page is swapcache, write it back even if that would
+ * If the folio is swapcache, write it back even if that would
* block, for some throttling. This happens by accident, because
* swap_backing_dev_info is bust: it doesn't reflect the
* congestion state of the swapdevs. Easy to fix, if needed.
*/
- if (!is_page_cache_freeable(page))
+ if (!is_page_cache_freeable(folio))
return PAGE_KEEP;
if (!mapping) {
/*
- * Some data journaling orphaned pages can have
- * page->mapping == NULL while being dirty with clean buffers.
+ * Some data journaling orphaned folios can have
+ * folio->mapping == NULL while being dirty with clean buffers.
*/
- if (page_has_private(page)) {
- if (try_to_free_buffers(page)) {
- ClearPageDirty(page);
- pr_info("%s: orphaned page\n", __func__);
+ if (folio_test_private(folio)) {
+ if (try_to_free_buffers(&folio->page)) {
+ folio_clear_dirty(folio);
+ pr_info("%s: orphaned folio\n", __func__);
return PAGE_CLEAN;
}
}
@@ -1199,10 +1191,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_inode(mapping->host))
- return PAGE_KEEP;
- if (clear_page_dirty_for_io(page)) {
+ if (folio_clear_dirty_for_io(folio)) {
int res;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
@@ -1212,21 +1202,21 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
.for_reclaim = 1,
};
- SetPageReclaim(page);
- res = mapping->a_ops->writepage(page, &wbc);
+ folio_set_reclaim(folio);
+ res = mapping->a_ops->writepage(&folio->page, &wbc);
if (res < 0)
- handle_write_error(mapping, page, res);
+ handle_write_error(mapping, folio, res);
if (res == AOP_WRITEPAGE_ACTIVATE) {
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
return PAGE_ACTIVATE;
}
- if (!PageWriteback(page)) {
+ if (!folio_test_writeback(folio)) {
/* synchronous write or broken a_ops? */
- ClearPageReclaim(page);
+ folio_clear_reclaim(folio);
}
- trace_mm_vmscan_writepage(page);
- inc_node_page_state(page, NR_VMSCAN_WRITE);
+ trace_mm_vmscan_write_folio(folio);
+ node_stat_add_folio(folio, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -1237,16 +1227,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
* Same as remove_mapping, but if the page is removed from the mapping, it
* gets returned with a refcount of 0.
*/
-static int __remove_mapping(struct address_space *mapping, struct page *page,
+static int __remove_mapping(struct address_space *mapping, struct folio *folio,
bool reclaimed, struct mem_cgroup *target_memcg)
{
int refcount;
void *shadow = NULL;
- BUG_ON(!PageLocked(page));
- BUG_ON(mapping != page_mapping(page));
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(mapping != folio_mapping(folio));
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
@@ -1274,23 +1264,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- refcount = 1 + compound_nr(page);
- if (!page_ref_freeze(page, refcount))
+ refcount = 1 + folio_nr_pages(folio);
+ if (!folio_ref_freeze(folio, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
- if (unlikely(PageDirty(page))) {
- page_ref_unfreeze(page, refcount);
+ if (unlikely(folio_test_dirty(folio))) {
+ folio_ref_unfreeze(folio, refcount);
goto cannot_free;
}
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- mem_cgroup_swapout(page, swap);
+ if (folio_test_swapcache(folio)) {
+ swp_entry_t swap = folio_swap_entry(folio);
+ mem_cgroup_swapout(folio, swap);
if (reclaimed && !mapping_exiting(mapping))
- shadow = workingset_eviction(page, target_memcg);
- __delete_from_swap_cache(page, swap, shadow);
+ shadow = workingset_eviction(folio, target_memcg);
+ __delete_from_swap_cache(&folio->page, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
- put_swap_page(page, swap);
+ put_swap_page(&folio->page, swap);
} else {
void (*freepage)(struct page *);
@@ -1311,61 +1301,67 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* exceptional entries and shadow exceptional entries in the
* same address_space.
*/
- if (reclaimed && page_is_file_lru(page) &&
+ if (reclaimed && folio_is_file_lru(folio) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(page, target_memcg);
- __delete_from_page_cache(page, shadow);
+ shadow = workingset_eviction(folio, target_memcg);
+ __filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (freepage != NULL)
- freepage(page);
+ freepage(&folio->page);
}
return 1;
cannot_free:
xa_unlock_irq(&mapping->i_pages);
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
spin_unlock(&mapping->host->i_lock);
return 0;
}
-/*
- * Attempt to detach a locked page from its ->mapping. If it is dirty or if
- * someone else has a ref on the page, abort and return 0. If it was
- * successfully detached, return 1. Assumes the caller has a single ref on
- * this page.
+/**
+ * remove_mapping() - Attempt to remove a folio from its mapping.
+ * @mapping: The address space.
+ * @folio: The folio to remove.
+ *
+ * If the folio is dirty, under writeback or if someone else has a ref
+ * on it, removal will fail.
+ * Return: The number of pages removed from the mapping. 0 if the folio
+ * could not be removed.
+ * Context: The caller should have a single refcount on the folio and
+ * hold its lock.
*/
-int remove_mapping(struct address_space *mapping, struct page *page)
+long remove_mapping(struct address_space *mapping, struct folio *folio)
{
- if (__remove_mapping(mapping, page, false, NULL)) {
+ if (__remove_mapping(mapping, folio, false, NULL)) {
/*
- * Unfreezing the refcount with 1 rather than 2 effectively
+ * Unfreezing the refcount with 1 effectively
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_ref_unfreeze(page, 1);
- return 1;
+ folio_ref_unfreeze(folio, 1);
+ return folio_nr_pages(folio);
}
return 0;
}
/**
- * putback_lru_page - put previously isolated page onto appropriate LRU list
- * @page: page to be put back to appropriate lru list
+ * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
+ * @folio: Folio to be returned to an LRU list.
*
- * Add previously isolated @page to appropriate LRU list.
- * Page may still be unevictable for other reasons.
+ * Add previously isolated @folio to appropriate LRU list.
+ * The folio may still be unevictable for other reasons.
*
- * lru_lock must not be held, interrupts must be enabled.
+ * Context: lru_lock must not be held, interrupts must be enabled.
*/
-void putback_lru_page(struct page *page)
+void folio_putback_lru(struct folio *folio)
{
- lru_cache_add(page);
- put_page(page); /* drop ref from isolate */
+ folio_add_lru(folio);
+ folio_put(folio); /* drop ref from isolate */
}
enum page_references {
@@ -1375,61 +1371,61 @@ enum page_references {
PAGEREF_ACTIVATE,
};
-static enum page_references page_check_references(struct page *page,
+static enum page_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
- int referenced_ptes, referenced_page;
+ int referenced_ptes, referenced_folio;
unsigned long vm_flags;
- referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
- &vm_flags);
- referenced_page = TestClearPageReferenced(page);
+ referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
+ &vm_flags);
+ referenced_folio = folio_test_clear_referenced(folio);
/*
- * Mlock lost the isolation race with us. Let try_to_unmap()
- * move the page to the unevictable list.
+ * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
+ * Let the folio, now marked Mlocked, be moved to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
- return PAGEREF_RECLAIM;
+ return PAGEREF_ACTIVATE;
if (referenced_ptes) {
/*
- * All mapped pages start out with page table
+ * All mapped folios start out with page table
* references from the instantiating fault, so we need
- * to look twice if a mapped file page is used more
+ * to look twice if a mapped file/anon folio is used more
* than once.
*
* Mark it and spare it for another trip around the
* inactive list. Another page table reference will
* lead to its activation.
*
- * Note: the mark is set for activated pages as well
- * so that recently deactivated but used pages are
+ * Note: the mark is set for activated folios as well
+ * so that recently deactivated but used folios are
* quickly recovered.
*/
- SetPageReferenced(page);
+ folio_set_referenced(folio);
- if (referenced_page || referenced_ptes > 1)
+ if (referenced_folio || referenced_ptes > 1)
return PAGEREF_ACTIVATE;
/*
- * Activate file-backed executable pages after first usage.
+ * Activate file-backed executable folios after first usage.
*/
- if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
+ if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
- /* Reclaim if clean, defer dirty pages to writeback */
- if (referenced_page && !PageSwapBacked(page))
+ /* Reclaim if clean, defer dirty folios to writeback */
+ if (referenced_folio && !folio_test_swapbacked(folio))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
}
/* Check if a page is dirty or under writeback */
-static void page_check_dirty_writeback(struct page *page,
+static void folio_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct address_space *mapping;
@@ -1438,24 +1434,24 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_lru(page) ||
- (PageAnon(page) && !PageSwapBacked(page))) {
+ if (!folio_is_file_lru(folio) ||
+ (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
*dirty = false;
*writeback = false;
return;
}
- /* By default assume that the page flags are accurate */
- *dirty = PageDirty(page);
- *writeback = PageWriteback(page);
+ /* By default assume that the folio flags are accurate */
+ *dirty = folio_test_dirty(folio);
+ *writeback = folio_test_writeback(folio);
/* Verify dirty/writeback state if the filesystem supports it */
- if (!page_has_private(page))
+ if (!folio_test_private(folio))
return;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (mapping && mapping->a_ops->is_dirty_writeback)
- mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+ mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback);
}
static struct page *alloc_demote_page(struct page *page, unsigned long node)
@@ -1529,14 +1525,16 @@ retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
+ struct folio *folio;
enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback, may_enter_fs;
unsigned int nr_pages;
cond_resched();
- page = lru_to_page(page_list);
- list_del(&page->lru);
+ folio = lru_to_folio(page_list);
+ list_del(&folio->lru);
+ page = &folio->page;
if (!trylock_page(page))
goto keep;
@@ -1562,12 +1560,12 @@ retry:
* reclaim_congested. kswapd will stall and start writing
* pages if the tail of the LRU is all dirty unqueued pages.
*/
- page_check_dirty_writeback(page, &dirty, &writeback);
+ folio_check_dirty_writeback(folio, &dirty, &writeback);
if (dirty || writeback)
- stat->nr_dirty++;
+ stat->nr_dirty += nr_pages;
if (dirty && !writeback)
- stat->nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty += nr_pages;
/*
* Treat this page as congested if the underlying BDI is or if
@@ -1576,10 +1574,8 @@ retry:
* end of the LRU a second time.
*/
mapping = page_mapping(page);
- if (((dirty || writeback) && mapping &&
- inode_write_congested(mapping->host)) ||
- (writeback && PageReclaim(page)))
- stat->nr_congested++;
+ if (writeback && PageReclaim(page))
+ stat->nr_congested += nr_pages;
/*
* If a page at the tail of the LRU is under writeback, there
@@ -1628,7 +1624,7 @@ retry:
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- stat->nr_immediate++;
+ stat->nr_immediate += nr_pages;
goto activate_locked;
/* Case 2 above */
@@ -1646,7 +1642,7 @@ retry:
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
- stat->nr_writeback++;
+ stat->nr_writeback += nr_pages;
goto activate_locked;
/* Case 3 above */
@@ -1660,7 +1656,7 @@ retry:
}
if (!ignore_references)
- references = page_check_references(page, sc);
+ references = folio_check_references(folio, sc);
switch (references) {
case PAGEREF_ACTIVATE:
@@ -1693,28 +1689,28 @@ retry:
if (!PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
- if (page_maybe_dma_pinned(page))
+ if (folio_maybe_dma_pinned(folio))
goto keep_locked;
if (PageTransHuge(page)) {
/* cannot split THP, skip it */
- if (!can_split_huge_page(page, NULL))
+ if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
* Split pages without a PMD map right
* away. Chances are some or all of the
* tail pages can be freed without IO.
*/
- if (!compound_mapcount(page) &&
- split_huge_page_to_list(page,
- page_list))
+ if (!folio_entire_mapcount(folio) &&
+ split_folio_to_list(folio,
+ page_list))
goto activate_locked;
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
goto activate_locked_split;
/* Fallback to swap normal pages */
- if (split_huge_page_to_list(page,
- page_list))
+ if (split_folio_to_list(folio,
+ page_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
@@ -1728,9 +1724,9 @@ retry:
/* Adding to swap updated mapping */
mapping = page_mapping(page);
}
- } else if (unlikely(PageTransHuge(page))) {
- /* Split file THP */
- if (split_huge_page_to_list(page, page_list))
+ } else if (PageSwapBacked(page) && PageTransHuge(page)) {
+ /* Split shmem THP */
+ if (split_folio_to_list(folio, page_list))
goto keep_locked;
}
@@ -1754,10 +1750,11 @@ retry:
enum ttu_flags flags = TTU_BATCH_FLUSH;
bool was_swapbacked = PageSwapBacked(page);
- if (unlikely(PageTransHuge(page)))
+ if (PageTransHuge(page) &&
+ thp_order(page) >= HPAGE_PMD_ORDER)
flags |= TTU_SPLIT_HUGE_PMD;
- try_to_unmap(page, flags);
+ try_to_unmap(folio, flags);
if (page_mapped(page)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
@@ -1805,13 +1802,13 @@ retry:
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(page, mapping)) {
+ switch (pageout(folio, mapping)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- stat->nr_pageout += thp_nr_pages(page);
+ stat->nr_pageout += nr_pages;
if (PageWriteback(page))
goto keep;
@@ -1889,7 +1886,7 @@ retry:
*/
count_vm_event(PGLAZYFREED);
count_memcg_page_event(page, PGLAZYFREED);
- } else if (!mapping || !__remove_mapping(mapping, page, true,
+ } else if (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup))
goto keep_locked;
@@ -2012,69 +2009,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
}
/*
- * Attempt to remove the specified page from its LRU. Only take this page
- * if it is of the appropriate PageActive status. Pages which are being
- * freed elsewhere are also ignored.
- *
- * page: page to consider
- * mode: one of the LRU isolation modes defined above
- *
- * returns true on success, false on failure.
- */
-bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
-{
- /* Only take pages on the LRU. */
- if (!PageLRU(page))
- return false;
-
- /* Compaction should not handle unevictable pages but CMA can do so */
- if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
- return false;
-
- /*
- * To minimise LRU disruption, the caller can indicate that it only
- * wants to isolate pages it will be able to operate on without
- * blocking - clean pages for the most part.
- *
- * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants to pages
- * that it is possible to migrate without blocking
- */
- if (mode & ISOLATE_ASYNC_MIGRATE) {
- /* All the caller can do on PageWriteback is block */
- if (PageWriteback(page))
- return false;
-
- if (PageDirty(page)) {
- struct address_space *mapping;
- bool migrate_dirty;
-
- /*
- * Only pages without mappings or that have a
- * ->migratepage callback are possible to migrate
- * without blocking. However, we can be racing with
- * truncation so it's necessary to lock the page
- * to stabilise the mapping as truncation holds
- * the page lock until after the page is removed
- * from the page cache.
- */
- if (!trylock_page(page))
- return false;
-
- mapping = page_mapping(page);
- migrate_dirty = !mapping || mapping->a_ops->migratepage;
- unlock_page(page);
- if (!migrate_dirty)
- return false;
- }
- }
-
- if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
- return false;
-
- return true;
-}
-
-/*
* Update LRU sizes after isolating pages. The LRU size updates must
* be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
@@ -2125,11 +2059,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
- isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
+ struct list_head *move_to = src;
struct page *page;
page = lru_to_page(src);
@@ -2139,9 +2073,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
total_scan += nr_pages;
if (page_zonenum(page) > sc->reclaim_idx) {
- list_move(&page->lru, &pages_skipped);
nr_skipped[page_zonenum(page)] += nr_pages;
- continue;
+ move_to = &pages_skipped;
+ goto move;
}
/*
@@ -2149,37 +2083,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
- *
- * Account all tail pages of THP. This would not cause
- * premature OOM since __isolate_lru_page() returns -EBUSY
- * only when the page is being freed somewhere else.
+ * Account all tail pages of THP.
*/
scan += nr_pages;
- if (!__isolate_lru_page_prepare(page, mode)) {
- /* It is being freed elsewhere */
- list_move(&page->lru, src);
- continue;
- }
+
+ if (!PageLRU(page))
+ goto move;
+ if (!sc->may_unmap && page_mapped(page))
+ goto move;
+
/*
* Be careful not to clear PageLRU until after we're
* sure the page is not being freed elsewhere -- the
* page release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page))) {
- list_move(&page->lru, src);
- continue;
- }
+ if (unlikely(!get_page_unless_zero(page)))
+ goto move;
if (!TestClearPageLRU(page)) {
/* Another thread is already isolating this page */
put_page(page);
- list_move(&page->lru, src);
- continue;
+ goto move;
}
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
- list_move(&page->lru, dst);
+ move_to = dst;
+move:
+ list_move(&page->lru, move_to);
}
/*
@@ -2203,51 +2134,47 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
}
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
- total_scan, skipped, nr_taken, mode, lru);
+ total_scan, skipped, nr_taken,
+ sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
/**
- * isolate_lru_page - tries to isolate a page from its LRU list
- * @page: page to isolate from its LRU list
- *
- * Isolates a @page from an LRU list, clears PageLRU and adjusts the
- * vmstat statistic corresponding to whatever LRU list the page was on.
+ * folio_isolate_lru() - Try to isolate a folio from its LRU list.
+ * @folio: Folio to isolate from its LRU list.
*
- * Returns 0 if the page was removed from an LRU list.
- * Returns -EBUSY if the page was not on an LRU list.
+ * Isolate a @folio from an LRU list and adjust the vmstat statistic
+ * corresponding to whatever LRU list the folio was on.
*
- * The returned page will have PageLRU() cleared. If it was found on
- * the active list, it will have PageActive set. If it was found on
- * the unevictable list, it will have the PageUnevictable bit set. That flag
+ * The folio will have its LRU flag cleared. If it was found on the
+ * active list, it will have the Active flag set. If it was found on the
+ * unevictable list, it will have the Unevictable flag set. These flags
* may need to be cleared by the caller before letting the page go.
*
- * The vmstat statistic corresponding to the list on which the page was
- * found will be decremented.
- *
- * Restrictions:
+ * Context:
*
* (1) Must be called with an elevated refcount on the page. This is a
- * fundamental difference from isolate_lru_pages (which is called
+ * fundamental difference from isolate_lru_pages() (which is called
* without a stable reference).
- * (2) the lru_lock must not be held.
- * (3) interrupts must be enabled.
+ * (2) The lru_lock must not be held.
+ * (3) Interrupts must be enabled.
+ *
+ * Return: 0 if the folio was removed from an LRU list.
+ * -EBUSY if the folio was not on an LRU list.
*/
-int isolate_lru_page(struct page *page)
+int folio_isolate_lru(struct folio *folio)
{
- struct folio *folio = page_folio(page);
int ret = -EBUSY;
- VM_BUG_ON_PAGE(!page_count(page), page);
- WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
+ VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
- if (TestClearPageLRU(page)) {
+ if (folio_test_clear_lru(folio)) {
struct lruvec *lruvec;
- get_page(page);
+ folio_get(folio);
lruvec = folio_lruvec_lock_irq(folio);
- del_page_from_lru_list(page, lruvec);
+ lruvec_del_folio(lruvec, folio);
unlock_page_lruvec_irq(lruvec);
ret = 0;
}
@@ -2377,9 +2304,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LOCAL_THROTTLE) ||
- current->backing_dev_info == NULL ||
- bdi_write_congested(current->backing_dev_info);
+ return !(current->flags & PF_LOCAL_THROTTLE);
}
/*
@@ -2485,7 +2410,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
*
* If the pages are mostly unmapped, the processing is fast and it is
* appropriate to hold lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()), so
+ * the pages are mapped, the processing is slow (folio_referenced()), so
* we should drop lru_lock around each page. It's impossible to balance
* this, so instead we remove the pages from the LRU while processing them.
* It is safe to rely on PG_active against the non-LRU pages in here because
@@ -2505,7 +2430,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
- struct page *page;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
@@ -2527,9 +2451,13 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_unlock_irq(&lruvec->lru_lock);
while (!list_empty(&l_hold)) {
+ struct folio *folio;
+ struct page *page;
+
cond_resched();
- page = lru_to_page(&l_hold);
- list_del(&page->lru);
+ folio = lru_to_folio(&l_hold);
+ list_del(&folio->lru);
+ page = &folio->page;
if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
@@ -2544,8 +2472,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
}
- if (page_referenced(page, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ if (folio_referenced(folio, 0, sc->target_mem_cgroup,
+ &vm_flags)) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -3975,7 +3903,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
if (!managed_zone(zone))
continue;
- mark = high_wmark_pages(zone);
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
+ mark = wmark_pages(zone, WMARK_PROMO);
+ else
+ mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
@@ -4472,7 +4403,7 @@ static int kswapd(void *p)
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -4523,7 +4454,7 @@ kswapd_try_sleep:
goto kswapd_try_sleep;
}
- tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
+ tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
return 0;
}
@@ -4764,11 +4695,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
- * and we also need to be able to write out pages for RECLAIM_WRITE
- * and RECLAIM_UNMAP.
*/
noreclaim_flag = memalloc_noreclaim_save();
- p->flags |= PF_SWAPWRITE;
set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
@@ -4782,7 +4710,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
set_task_reclaim_state(p, NULL);
- current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
psi_memstall_leave(&pflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4057372745d0..b75b1a64b54c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
@@ -1242,6 +1243,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ "pgpromote_success",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1385,6 +1389,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
+#ifdef CONFIG_KSM
+ "ksm_swpin_copy",
+#endif
#endif
#ifdef CONFIG_X86
"direct_map_level2_splits",
@@ -2043,7 +2050,12 @@ static void __init init_cpu_node_state(void)
static int vmstat_cpu_online(unsigned int cpu)
{
refresh_zone_stat_thresholds();
- node_set_state(cpu_to_node(cpu), N_CPU);
+
+ if (!node_state(cpu_to_node(cpu), N_CPU)) {
+ node_set_state(cpu_to_node(cpu), N_CPU);
+ set_migration_target_nodes();
+ }
+
return 0;
}
@@ -2066,6 +2078,8 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
+ set_migration_target_nodes();
+
return 0;
}
@@ -2097,6 +2111,9 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
+#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
+ migrate_on_reclaim_init();
+#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
diff --git a/mm/workingset.c b/mm/workingset.c
index 8c03afe1d67c..8a3828acc0bf 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -245,31 +245,32 @@ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
}
/**
- * workingset_eviction - note the eviction of a page from memory
+ * workingset_eviction - note the eviction of a folio from memory
* @target_memcg: the cgroup that is causing the reclaim
- * @page: the page being evicted
+ * @folio: the folio being evicted
*
- * Return: a shadow entry to be stored in @page->mapping->i_pages in place
- * of the evicted @page so that a later refault can be detected.
+ * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
+ * of the evicted @folio so that a later refault can be detected.
*/
-void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
+void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
- struct pglist_data *pgdat = page_pgdat(page);
+ struct pglist_data *pgdat = folio_pgdat(folio);
unsigned long eviction;
struct lruvec *lruvec;
int memcgid;
- /* Page is fully exclusive and pins page's memory cgroup pointer */
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ /* Folio is fully exclusive and pins folio's memory cgroup pointer */
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
- return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
+ workingset_age_nonresident(lruvec, folio_nr_pages(folio));
+ return pack_shadow(memcgid, pgdat, eviction,
+ folio_test_workingset(folio));
}
/**
@@ -429,10 +430,12 @@ out:
* point where they would still be useful.
*/
-static struct list_lru shadow_nodes;
+struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -441,7 +444,8 @@ void workingset_update_node(struct xa_node *node)
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
diff --git a/mm/zswap.c b/mm/zswap.c
index cdf6950fcb2e..3efd8cae315e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -120,11 +120,19 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
uint, 0644);
-/* Enable/disable handling same-value filled pages (enabled by default) */
+/*
+ * Enable/disable handling same-value filled pages (enabled by default).
+ * If disabled every page is considered non-same-value filled.
+ */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
bool, 0644);
+/* Enable/disable handling non-same-value filled pages (enabled by default) */
+static bool zswap_non_same_filled_pages_enabled = true;
+module_param_named(non_same_filled_pages_enabled, zswap_non_same_filled_pages_enabled,
+ bool, 0644);
+
/*********************************
* data structures
**********************************/
@@ -1147,6 +1155,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
kunmap_atomic(src);
}
+ if (!zswap_non_same_filled_pages_enabled) {
+ ret = -EINVAL;
+ goto freepage;
+ }
+
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool) {