aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c24
-rw-r--r--mm/cleancache.c10
-rw-r--r--mm/cma_debug.c25
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/failslab.c2
-rw-r--r--mm/filemap.c8
-rw-r--r--mm/frontswap.c11
-rw-r--r--mm/gup.c78
-rw-r--r--mm/gup_benchmark.c2
-rw-r--r--mm/hmm.c13
-rw-r--r--mm/huge_memory.c11
-rw-r--r--mm/hugetlb.c47
-rw-r--r--mm/hugetlb_cgroup.c6
-rw-r--r--mm/init-mm.c1
-rw-r--r--mm/internal.h2
-rw-r--r--mm/ksm.c25
-rw-r--r--mm/memblock.c36
-rw-r--r--mm/memcontrol.c352
-rw-r--r--mm/memfd.c345
-rw-r--r--mm/memory.c43
-rw-r--r--mm/memory_hotplug.c23
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page_alloc.c76
-rw-r--r--mm/page_counter.c100
-rw-r--r--mm/page_idle.c2
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/percpu-stats.c2
-rw-r--r--mm/readahead.c39
-rw-r--r--mm/shmem.c391
-rw-r--r--mm/slab.c7
-rw-r--r--mm/slab_common.c37
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c131
-rw-r--r--mm/sparse.c6
-rw-r--r--mm/swap.c3
-rw-r--r--mm/swap_slots.c14
-rw-r--r--mm/swap_state.c5
-rw-r--r--mm/swapfile.c7
-rw-r--r--mm/userfaultfd.c22
-rw-r--r--mm/util.c6
-rw-r--r--mm/vmalloc.c45
-rw-r--r--mm/vmpressure.c35
-rw-r--r--mm/vmscan.c38
-rw-r--r--mm/zsmalloc.c5
-rw-r--r--mm/zswap.c38
51 files changed, 1216 insertions, 892 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e0b6e87f65d..ce95491abd6a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -694,6 +694,9 @@ config ARCH_HAS_HMM
config MIGRATE_VMA_HELPER
bool
+config DEV_PAGEMAP_OPS
+ bool
+
config HMM
bool
select MIGRATE_VMA_HELPER
@@ -714,6 +717,7 @@ config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ARCH_HAS_HMM
select HMM
+ select DEV_PAGEMAP_OPS
help
Allows creation of struct pages to represent unaddressable device
@@ -724,6 +728,7 @@ config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM
select HMM
+ select DEV_PAGEMAP_OPS
help
Allows creation of struct pages to represent addressable device
@@ -754,3 +759,6 @@ config GUP_BENCHMARK
performance of get_user_pages_fast().
See tools/testing/selftests/vm/gup_benchmark.c
+
+config ARCH_HAS_PTE_SPECIAL
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index b4e54a9ae9c5..8716bdabe1e6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fe3ebd6ac00..2e5d3df0853d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
- /*
- * Wait for wb shutdown to finish if someone else is just
- * running wb_shutdown(). Otherwise we could proceed to wb /
- * bdi destruction before wb_shutdown() is finished.
- */
- wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
- set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
cgwb_remove_from_bdi_list(wb);
@@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
- /*
- * Make sure bit gets cleared after shutdown is finished. Matches with
- * the barrier provided by test_and_clear_bit() above.
- */
- smp_wmb();
- clear_and_wake_up_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work)
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
+ mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb);
css_put(wb->memcg_css);
css_put(wb->blkcg_css);
+ mutex_unlock(&wb->bdi->cgwb_release_mutex);
fprop_local_destroy_percpu(&wb->memcg_completions);
percpu_ref_exit(&wb->refcnt);
@@ -557,7 +546,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
- memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ memcg_cgwb_list = &memcg->cgwb_list;
blkcg_cgwb_list = &blkcg->cgwb_list;
/* look up again under lock and discard on blkcg mismatch */
@@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
+ mutex_init(&bdi->cgwb_release_mutex);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
+ spin_unlock_irq(&cgwb_lock);
+ mutex_lock(&bdi->cgwb_release_mutex);
+ spin_lock_irq(&cgwb_lock);
while (!list_empty(&bdi->wb_list)) {
wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
bdi_node);
@@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
spin_lock_irq(&cgwb_lock);
}
spin_unlock_irq(&cgwb_lock);
+ mutex_unlock(&bdi->cgwb_release_mutex);
}
/**
@@ -736,7 +730,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
*/
void wb_memcg_offline(struct mem_cgroup *memcg)
{
- struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
struct bdi_writeback *wb, *next;
spin_lock_irq(&cgwb_lock);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 126548b5a292..2bf12da9baa0 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -307,12 +307,10 @@ static int __init init_cleancache(void)
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
return -ENXIO;
- debugfs_create_u64("succ_gets", S_IRUGO, root, &cleancache_succ_gets);
- debugfs_create_u64("failed_gets", S_IRUGO,
- root, &cleancache_failed_gets);
- debugfs_create_u64("puts", S_IRUGO, root, &cleancache_puts);
- debugfs_create_u64("invalidates", S_IRUGO,
- root, &cleancache_invalidates);
+ debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets);
+ debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets);
+ debugfs_create_u64("puts", 0444, root, &cleancache_puts);
+ debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates);
#endif
return 0;
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 275df8b5b22e..f23467291cfb 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -172,23 +172,18 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
tmp = debugfs_create_dir(name, cma_debugfs_root);
- debugfs_create_file("alloc", S_IWUSR, tmp, cma,
- &cma_alloc_fops);
-
- debugfs_create_file("free", S_IWUSR, tmp, cma,
- &cma_free_fops);
-
- debugfs_create_file("base_pfn", S_IRUGO, tmp,
- &cma->base_pfn, &cma_debugfs_fops);
- debugfs_create_file("count", S_IRUGO, tmp,
- &cma->count, &cma_debugfs_fops);
- debugfs_create_file("order_per_bit", S_IRUGO, tmp,
- &cma->order_per_bit, &cma_debugfs_fops);
- debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
- debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+ debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
+ debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
+ debugfs_create_file("base_pfn", 0444, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", 0444, tmp, &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", 0444, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+ debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
+ debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+ debugfs_create_u32_array("bitmap", 0444, tmp, (u32 *)cma->bitmap, u32s);
}
static int __init cma_debugfs_init(void)
diff --git a/mm/compaction.c b/mm/compaction.c
index 29bd1df18b98..faca45ebe62d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1899,7 +1899,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
+static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
int compaction_register_node(struct node *node)
{
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4d90a64b2fdc..6d4b97e7e9e9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -105,7 +105,7 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf)
return PAGE_SIZE - size;
}
-static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
+static DEVICE_ATTR(pools, 0444, show_pools, NULL);
/**
* dma_pool_create - Creates a pool of consistent memory blocks, for dma.
diff --git a/mm/failslab.c b/mm/failslab.c
index 1f2f248e3601..b135ebb88b6f 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -42,7 +42,7 @@ __setup("failslab=", setup_failslab);
static int __init failslab_debugfs_init(void)
{
struct dentry *dir;
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | 0600;
dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
if (IS_ERR(dir))
diff --git a/mm/filemap.c b/mm/filemap.c
index 0604cb02e6f3..52517f28e6f4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
*/
-int filemap_fault(struct vm_fault *vmf)
+vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
@@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf)
pgoff_t offset = vmf->pgoff;
pgoff_t max_off;
struct page *page;
- int ret = 0;
+ vm_fault_t ret = 0;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
@@ -2693,11 +2693,11 @@ next:
}
EXPORT_SYMBOL(filemap_map_pages);
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
- int ret = VM_FAULT_LOCKED;
+ vm_fault_t ret = VM_FAULT_LOCKED;
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 4f5476a0f955..157e5bf63504 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -486,12 +486,11 @@ static int __init init_frontswap(void)
struct dentry *root = debugfs_create_dir("frontswap", NULL);
if (root == NULL)
return -ENXIO;
- debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
- debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
- debugfs_create_u64("failed_stores", S_IRUGO, root,
- &frontswap_failed_stores);
- debugfs_create_u64("invalidates", S_IRUGO,
- root, &frontswap_invalidates);
+ debugfs_create_u64("loads", 0444, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", 0444, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
#endif
return 0;
}
diff --git a/mm/gup.c b/mm/gup.c
index 541904a7c60f..b70d7ba7cc13 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
unsigned int flags, unsigned int *page_mask)
{
- pmd_t *pmd;
+ pmd_t *pmd, pmdval;
spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
pmd = pmd_offset(pudp, address);
- if (pmd_none(*pmd))
+ /*
+ * The READ_ONCE() will stabilize the pmdval in a register or
+ * on the stack so that it will stop changing under the code.
+ */
+ pmdval = READ_ONCE(*pmd);
+ if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
return no_page_table(vma, flags);
}
- if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+ if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
page = follow_huge_pd(vma, address,
- __hugepd(pmd_val(*pmd)), flags,
+ __hugepd(pmd_val(pmdval)), flags,
PMD_SHIFT);
if (page)
return page;
return no_page_table(vma, flags);
}
retry:
- if (!pmd_present(*pmd)) {
+ if (!pmd_present(pmdval)) {
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(*pmd));
- if (is_pmd_migration_entry(*pmd))
+ !is_pmd_migration_entry(pmdval));
+ if (is_pmd_migration_entry(pmdval))
pmd_migration_entry_wait(mm, pmd);
+ pmdval = READ_ONCE(*pmd);
+ /*
+ * MADV_DONTNEED may convert the pmd to null because
+ * mmap_sem is held in read mode
+ */
+ if (pmd_none(pmdval))
+ return no_page_table(vma, flags);
goto retry;
}
- if (pmd_devmap(*pmd)) {
+ if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
if (page)
return page;
}
- if (likely(!pmd_trans_huge(*pmd)))
+ if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags);
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
retry_locked:
ptl = pmd_lock(mm, pmd);
+ if (unlikely(pmd_none(*pmd))) {
+ spin_unlock(ptl);
+ return no_page_table(vma, flags);
+ }
if (unlikely(!pmd_present(*pmd))) {
spin_unlock(ptl);
if (likely(!(flags & FOLL_MIGRATION)))
@@ -1354,7 +1370,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
-#ifdef __HAVE_ARCH_PTE_SPECIAL
+#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
@@ -1430,7 +1446,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
{
return 0;
}
-#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
@@ -1459,32 +1475,48 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
return 1;
}
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
unsigned long fault_pfn;
+ int nr_start = *nr;
+
+ fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+ return 0;
- fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+ if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ return 1;
}
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
unsigned long fault_pfn;
+ int nr_start = *nr;
- fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+ fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+ return 0;
+
+ if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ return 1;
}
#else
-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
BUILD_BUG();
@@ -1502,7 +1534,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
if (pmd_devmap(orig))
- return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+ return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1540,7 +1572,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
if (pud_devmap(orig))
- return __gup_device_huge_pud(orig, addr, end, pages, nr);
+ return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 0f44759486e2..6a473709e9b6 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -23,7 +23,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
struct page **pages;
nr_pages = gup->size / PAGE_SIZE;
- pages = kvzalloc(sizeof(void *) * nr_pages, GFP_KERNEL);
+ pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
if (!pages)
return -ENOMEM;
diff --git a/mm/hmm.c b/mm/hmm.c
index e63e353830e8..de7b6bf77201 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -35,15 +35,6 @@
#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-#if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC)
-/*
- * Device private memory see HMM (Documentation/vm/hmm.rst) or hmm.h
- */
-DEFINE_STATIC_KEY_FALSE(device_private_key);
-EXPORT_SYMBOL(device_private_key);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
-
-
#if IS_ENABLED(CONFIG_HMM_MIRROR)
static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
@@ -1167,7 +1158,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
resource_size_t addr;
int ret;
- static_branch_enable(&device_private_key);
+ dev_pagemap_get_ops();
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
GFP_KERNEL, dev_to_node(device));
@@ -1261,7 +1252,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
return ERR_PTR(-EINVAL);
- static_branch_enable(&device_private_key);
+ dev_pagemap_get_ops();
devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
GFP_KERNEL, dev_to_node(device));
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ac5591d8622c..1cd7c1a57a14 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
static inline struct list_head *page_deferred_list(struct page *page)
{
- /*
- * ->lru in the tail pages is occupied by compound_head.
- * Let's use ->mapping + ->index in the second tail page as list_head.
- */
- return (struct list_head *)&page[2].mapping;
+ /* ->lru in the tail pages is occupied by compound_head. */
+ return &page[2].deferred_list;
}
void prep_transhuge_page(struct page *page)
@@ -1134,8 +1131,8 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
- pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
- GFP_KERNEL);
+ pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
+ GFP_KERNEL);
if (unlikely(!pages)) {
ret |= VM_FAULT_OOM;
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 129088710510..3612fbb32e9d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2798,7 +2798,8 @@ static int __init hugetlb_init(void)
num_fault_mutexes = 1;
#endif
hugetlb_fault_mutex_table =
- kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
+ kmalloc_array(num_fault_mutexes, sizeof(struct mutex),
+ GFP_KERNEL);
BUG_ON(!hugetlb_fault_mutex_table);
for (i = 0; i < num_fault_mutexes; i++)
@@ -3159,7 +3160,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
* hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
-static int hugetlb_vm_op_fault(struct vm_fault *vmf)
+static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
BUG();
return 0;
@@ -3686,6 +3687,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
+ unsigned long haddr = address & huge_page_mask(h);
/*
* Currently, we are forced to kill the process in the event the
@@ -3716,7 +3718,7 @@ retry:
u32 hash;
struct vm_fault vmf = {
.vma = vma,
- .address = address,
+ .address = haddr,
.flags = flags,
/*
* Hard to debug if it ends up being
@@ -3733,14 +3735,14 @@ retry:
* fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, address);
+ idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
- page = alloc_huge_page(vma, address, 0);
+ page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
if (ret == -ENOMEM)
@@ -3789,12 +3791,12 @@ retry:
* the spinlock.
*/
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
- if (vma_needs_reservation(h, vma, address) < 0) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, address);
+ vma_end_reservation(h, vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -3808,17 +3810,17 @@ retry:
if (anon_rmap) {
ClearPagePrivate(page);
- hugepage_add_new_anon_rmap(page, vma, address);
+ hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
- set_huge_pte_at(mm, address, ptep, new_pte);
+ set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl);
}
spin_unlock(ptl);
@@ -3830,7 +3832,7 @@ backout:
spin_unlock(ptl);
backout_unlocked:
unlock_page(page);
- restore_reserve_on_error(h, vma, address, page);
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
goto out;
}
@@ -3883,10 +3885,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
+ unsigned long haddr = address & huge_page_mask(h);
- address &= huge_page_mask(h);
-
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
@@ -3896,20 +3897,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
} else {
- ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
}
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ idx = vma_hugecache_offset(h, vma, haddr);
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -3939,16 +3940,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* consumed.
*/
if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
- if (vma_needs_reservation(h, vma, address) < 0) {
+ if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
}
/* Just decrements count, does not deallocate */
- vma_end_reservation(h, vma, address);
+ vma_end_reservation(h, vma, haddr);
if (!(vma->vm_flags & VM_MAYSHARE))
pagecache_page = hugetlbfs_pagecache_page(h,
- vma, address);
+ vma, haddr);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -3973,16 +3974,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep,
+ ret = hugetlb_cow(mm, vma, haddr, ptep,
pagecache_page, ptl);
goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
- if (huge_ptep_set_access_flags(vma, address, ptep, entry,
+ if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
flags & FAULT_FLAG_WRITE))
- update_mmu_cache(vma, address, ptep);
+ update_mmu_cache(vma, haddr, ptep);
out_put_page:
if (page != pagecache_page)
unlock_page(page);
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..68c2f2f3c05b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
limit = round_down(PAGE_COUNTER_MAX,
1 << huge_page_order(&hstates[idx]));
- ret = page_counter_limit(counter, limit);
+ ret = page_counter_set_max(counter, limit);
VM_BUG_ON(ret);
}
}
@@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
case RES_USAGE:
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_LIMIT:
mutex_lock(&hugetlb_limit_mutex);
- ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
+ ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages);
mutex_unlock(&hugetlb_limit_mutex);
break;
default:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index f94d5d15ebc0..f0179c9c04c2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -22,6 +22,7 @@ struct mm_struct init_mm = {
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
+ .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
INIT_MM_CONTEXT(init_mm)
diff --git a/mm/internal.h b/mm/internal.h
index 502d14189794..9e3654d70289 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-extern int __do_page_cache_readahead(struct address_space *mapping,
+extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size);
diff --git a/mm/ksm.c b/mm/ksm.c
index 7d6558f3bac9..a6d43cf9a982 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -216,6 +216,8 @@ struct rmap_item {
#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */
#define STABLE_FLAG 0x200 /* is listed from the stable tree */
+#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
+ /* to mask all the flags */
/* The stable and unstable tree heads */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -840,6 +842,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
+static inline struct stable_node *page_stable_node(struct page *page)
+{
+ return PageKsm(page) ? page_rmapping(page) : NULL;
+}
+
+static inline void set_page_stable_node(struct page *page,
+ struct stable_node *stable_node)
+{
+ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
+}
+
#ifdef CONFIG_SYSFS
/*
* Only called through the sysfs control interface:
@@ -2587,10 +2600,15 @@ again:
anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
+ unsigned long addr;
+
cond_resched();
vma = vmac->vma;
- if (rmap_item->address < vma->vm_start ||
- rmap_item->address >= vma->vm_end)
+
+ /* Ignore the stable/unstable/sqnr flags */
+ addr = rmap_item->address & ~KSM_FLAG_MASK;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
/*
* Initially we examine only the vma which covers this
@@ -2604,8 +2622,7 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- if (!rwc->rmap_one(page, vma,
- rmap_item->address, rwc->arg)) {
+ if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
return;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 5108356ad8aa..03d48d8835ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void)
/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
- return *size = min(*size, (phys_addr_t)ULLONG_MAX - base);
+ return *size = min(*size, PHYS_ADDR_MAX - base);
}
/*
@@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type,
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
+ &base, &end, (void *)_RET_IP_);
+
return memblock_remove_range(&memblock.memory, base, size);
}
@@ -925,7 +930,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
@@ -1041,7 +1046,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
r = &type_b->regions[idx_b];
r_start = idx_b ? r[-1].base + r[-1].size : 0;
r_end = idx_b < type_b->cnt ?
- r->base : (phys_addr_t)ULLONG_MAX;
+ r->base : PHYS_ADDR_MAX;
/*
* if idx_b advanced past idx_a,
* break out to advance idx_a
@@ -1516,13 +1521,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
struct memblock_region *r;
/*
* translate the memory @limit size into the max address within one of
* the memory memblock regions, if the @limit exceeds the total size
- * of those regions, max_addr will keep original value ULLONG_MAX
+ * of those regions, max_addr will keep original value PHYS_ADDR_MAX
*/
for_each_memblock(memory, r) {
if (limit <= r->size) {
@@ -1537,7 +1542,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ phys_addr_t max_addr = PHYS_ADDR_MAX;
if (!limit)
return;
@@ -1545,14 +1550,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
/* truncate both memory and reserved regions */
memblock_remove_range(&memblock.memory, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
memblock_remove_range(&memblock.reserved, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ PHYS_ADDR_MAX);
}
void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
@@ -1580,7 +1585,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
/* truncate the reserved regions */
memblock_remove_range(&memblock.reserved, 0, base);
memblock_remove_range(&memblock.reserved,
- base + size, (phys_addr_t)ULLONG_MAX);
+ base + size, PHYS_ADDR_MAX);
}
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
@@ -1593,7 +1598,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
max_addr = __find_max_addr(limit);
/* @limit exceeds the total size of the memory, do nothing */
- if (max_addr == (phys_addr_t)ULLONG_MAX)
+ if (max_addr == PHYS_ADDR_MAX)
return;
memblock_cap_memory_range(0, max_addr);
@@ -1803,10 +1808,13 @@ static int __init memblock_init_debugfs(void)
struct dentry *root = debugfs_create_dir("memblock", NULL);
if (!root)
return -ENXIO;
- debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
- debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+ debugfs_create_file("memory", 0444, root,
+ &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", 0444, root,
+ &memblock.reserved, &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- debugfs_create_file("physmem", S_IRUGO, root, &memblock.physmem, &memblock_debug_fops);
+ debugfs_create_file("physmem", 0444, root,
+ &memblock.physmem, &memblock_debug_fops);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1695f38630f1..e6f0d5ef320a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = READ_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.max);
if (count < limit)
margin = limit - count;
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
- limit = READ_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.max);
if (count <= limit)
margin = min(margin, limit - count);
else
@@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
- K((u64)memcg->memory.limit), memcg->memory.failcnt);
+ K((u64)memcg->memory.max), memcg->memory.failcnt);
pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
for_each_mem_cgroup_tree(iter, memcg) {
pr_info("Memory cgroup stats for ");
@@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
+unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long limit;
+ unsigned long max;
- limit = memcg->memory.limit;
+ max = memcg->memory.max;
if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_limit;
- unsigned long swap_limit;
+ unsigned long memsw_max;
+ unsigned long swap_max;
- memsw_limit = memcg->memsw.limit;
- swap_limit = memcg->swap.limit;
- swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
- limit = min(limit + swap_limit, memsw_limit);
+ memsw_max = memcg->memsw.max;
+ swap_max = memcg->swap.max;
+ swap_max = min(swap_max, (unsigned long)total_swap_pages);
+ max = min(max + swap_max, memsw_max);
}
- return limit;
+ return max;
}
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2444,12 +2444,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
-static DEFINE_MUTEX(memcg_limit_mutex);
+static DEFINE_MUTEX(memcg_max_mutex);
-static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
- unsigned long limit, bool memsw)
+static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
+ unsigned long max, bool memsw)
{
bool enlarge = false;
+ bool drained = false;
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
@@ -2460,26 +2461,32 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
break;
}
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
/*
* Make sure that the new limit (memsw or memory limit) doesn't
- * break our basic invariant rule memory.limit <= memsw.limit.
+ * break our basic invariant rule memory.max <= memsw.max.
*/
- limits_invariant = memsw ? limit >= memcg->memory.limit :
- limit <= memcg->memsw.limit;
+ limits_invariant = memsw ? max >= memcg->memory.max :
+ max <= memcg->memsw.max;
if (!limits_invariant) {
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
ret = -EINVAL;
break;
}
- if (limit > counter->limit)
+ if (max > counter->max)
enlarge = true;
- ret = page_counter_limit(counter, limit);
- mutex_unlock(&memcg_limit_mutex);
+ ret = page_counter_set_max(counter, max);
+ mutex_unlock(&memcg_max_mutex);
if (!ret)
break;
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
if (!try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, !memsw)) {
ret = -EBUSY;
@@ -2603,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
+
+ drain_all_stock(memcg);
+
/* try to free all pages in this cgroup */
while (nr_retries && page_counter_read(&memcg->memory)) {
int progress;
@@ -2757,7 +2767,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
return (u64)page_counter_read(counter) * PAGE_SIZE;
case RES_LIMIT:
- return (u64)counter->limit * PAGE_SIZE;
+ return (u64)counter->max * PAGE_SIZE;
case RES_MAX_USAGE:
return (u64)counter->watermark * PAGE_SIZE;
case RES_FAILCNT:
@@ -2871,24 +2881,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
#endif /* !CONFIG_SLOB */
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_update_kmem_max(struct mem_cgroup *memcg,
+ unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
- ret = page_counter_limit(&memcg->kmem, limit);
- mutex_unlock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
+ ret = page_counter_set_max(&memcg->kmem, max);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
-static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;
- mutex_lock(&memcg_limit_mutex);
+ mutex_lock(&memcg_max_mutex);
- ret = page_counter_limit(&memcg->tcpmem, limit);
+ ret = page_counter_set_max(&memcg->tcpmem, max);
if (ret)
goto out;
@@ -2913,7 +2923,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
memcg->tcpmem_active = true;
}
out:
- mutex_unlock(&memcg_limit_mutex);
+ mutex_unlock(&memcg_max_mutex);
return ret;
}
@@ -2941,16 +2951,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
switch (MEMFILE_TYPE(of_cft(of)->private)) {
case _MEM:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, false);
break;
case _MEMSWAP:
- ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
+ ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- ret = memcg_update_kmem_limit(memcg, nr_pages);
+ ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
- ret = memcg_update_tcp_limit(memcg, nr_pages);
+ ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
break;
@@ -3083,7 +3093,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
#endif /* CONFIG_NUMA */
/* Universal VM events cgroup1 shows, original sort order */
-unsigned int memcg1_events[] = {
+static const unsigned int memcg1_events[] = {
PGPGIN,
PGPGOUT,
PGFAULT,
@@ -3126,8 +3136,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
- memory = min(memory, mi->memory.limit);
- memsw = min(memsw, mi->memsw.limit);
+ memory = min(memory, mi->memory.max);
+ memsw = min(memsw, mi->memsw.max);
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
@@ -3540,7 +3550,8 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
- seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+ seq_printf(sf, "oom_kill %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
return 0;
}
@@ -3562,11 +3573,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
#ifdef CONFIG_CGROUP_WRITEBACK
-struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
-{
- return &memcg->cgwb_list;
-}
-
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -3626,7 +3632,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
- unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long ceiling = min(memcg->memory.max, memcg->high);
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4270,7 +4276,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- memcg->low = 0;
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
@@ -4319,12 +4326,13 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
- page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
- memcg->low = 0;
+ page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+ page_counter_set_min(&memcg->memory, 0);
+ page_counter_set_low(&memcg->memory, 0);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
memcg_wb_domain_size_changed(memcg);
@@ -5061,10 +5069,40 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static int memory_min_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long min = READ_ONCE(memcg->memory.min);
+
+ if (min == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+ return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long min;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &min);
+ if (err)
+ return err;
+
+ page_counter_set_min(&memcg->memory, min);
+
+ return nbytes;
+}
+
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->memory.low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5086,7 +5124,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
if (err)
return err;
- memcg->low = low;
+ page_counter_set_low(&memcg->memory, low);
return nbytes;
}
@@ -5131,7 +5169,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5155,7 +5193,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- xchg(&memcg->memory.limit, max);
+ xchg(&memcg->memory.max, max);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -5202,7 +5240,8 @@ static int memory_events_show(struct seq_file *m, void *v)
atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
seq_printf(m, "oom %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
return 0;
}
@@ -5296,6 +5335,12 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "min",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_min_show,
+ .write = memory_min_write,
+ },
+ {
.name = "low",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_low_show,
@@ -5344,54 +5389,144 @@ struct cgroup_subsys memory_cgrp_subsys = {
};
/**
- * mem_cgroup_low - check if memory consumption is below the normal range
+ * mem_cgroup_protected - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ * of a top-down tree iteration, not for isolated queries.
+ *
+ * Returns one of the following:
+ * MEMCG_PROT_NONE: cgroup memory is not protected
+ * MEMCG_PROT_LOW: cgroup memory is protected as long there is
+ * an unprotected supply of reclaimable memory from other cgroups.
+ * MEMCG_PROT_MIN: cgroup memory is protected
+ *
+ * @root is exclusive; it is never protected when looked at directly
*
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ * To provide a proper hierarchical behavior, effective memory.min/low values
+ * are used. Below is the description of how effective memory.low is calculated.
+ * Effective memory.min values is calculated in the same way.
*
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
+ * Effective memory.low is always equal or less than the original memory.low.
+ * If there is no memory.low overcommittment (which is always true for
+ * top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a part of parent's effective memory.low,
+ * calculated as a cgroup's memory.low usage divided by sum of sibling's
+ * memory.low usages, where memory.low usage is the size of actually
+ * protected memory.
*
- * For example, given cgroup A with children B and C:
+ * low_usage
+ * elow = min( memory.low, parent->elow * ------------------ ),
+ * siblings_low_usage
*
- * A
- * / \
- * B C
+ * | memory.current, if memory.current < memory.low
+ * low_usage = |
+ | 0, otherwise.
*
- * and
*
- * 1. A/memory.current > A/memory.high
- * 2. A/B/memory.current < A/B/memory.low
- * 3. A/C/memory.current >= A/C/memory.low
+ * Such definition of the effective memory.low provides the expected
+ * hierarchical behavior: parent's memory.low value is limiting
+ * children, unprotected memory is reclaimed first and cgroups,
+ * which are not using their guarantee do not affect actual memory
+ * distribution.
*
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
+ *
+ * A A/memory.low = 2G, A/memory.current = 6G
+ * //\\
+ * BC DE B/memory.low = 3G B/memory.current = 2G
+ * C/memory.low = 1G C/memory.current = 2G
+ * D/memory.low = 0 D/memory.current = 2G
+ * E/memory.low = 10G E/memory.current = 0
+ *
+ * and the memory pressure is applied, the following memory distribution
+ * is expected (approximately):
+ *
+ * A/memory.current = 2G
+ *
+ * B/memory.current = 1.3G
+ * C/memory.current = 0.6G
+ * D/memory.current = 0
+ * E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_protected_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_protected()
+ * path for each memory cgroup top-down from the reclaim,
+ * it's possible to optimize this part, and save calculated elow
+ * for next usage. This part is intentionally racy, but it's ok,
+ * as memory.low is a best-effort mechanism.
*/
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
+ struct mem_cgroup *parent;
+ unsigned long emin, parent_emin;
+ unsigned long elow, parent_elow;
+ unsigned long usage;
+
if (mem_cgroup_disabled())
- return false;
+ return MEMCG_PROT_NONE;
if (!root)
root = root_mem_cgroup;
if (memcg == root)
- return false;
+ return MEMCG_PROT_NONE;
+
+ usage = page_counter_read(&memcg->memory);
+ if (!usage)
+ return MEMCG_PROT_NONE;
- for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
- if (page_counter_read(&memcg->memory) >= memcg->low)
- return false;
+ emin = memcg->memory.min;
+ elow = memcg->memory.low;
+
+ parent = parent_mem_cgroup(memcg);
+ /* No parent means a non-hierarchical mode on v1 memcg */
+ if (!parent)
+ return MEMCG_PROT_NONE;
+
+ if (parent == root)
+ goto exit;
+
+ parent_emin = READ_ONCE(parent->memory.emin);
+ emin = min(emin, parent_emin);
+ if (emin && parent_emin) {
+ unsigned long min_usage, siblings_min_usage;
+
+ min_usage = min(usage, memcg->memory.min);
+ siblings_min_usage = atomic_long_read(
+ &parent->memory.children_min_usage);
+
+ if (min_usage && siblings_min_usage)
+ emin = min(emin, parent_emin * min_usage /
+ siblings_min_usage);
}
- return true;
+ parent_elow = READ_ONCE(parent->memory.elow);
+ elow = min(elow, parent_elow);
+ if (elow && parent_elow) {
+ unsigned long low_usage, siblings_low_usage;
+
+ low_usage = min(usage, memcg->memory.low);
+ siblings_low_usage = atomic_long_read(
+ &parent->memory.children_low_usage);
+
+ if (low_usage && siblings_low_usage)
+ elow = min(elow, parent_elow * low_usage /
+ siblings_low_usage);
+ }
+
+exit:
+ memcg->memory.emin = emin;
+ memcg->memory.elow = elow;
+
+ if (usage <= emin)
+ return MEMCG_PROT_MIN;
+ else if (usage <= elow)
+ return MEMCG_PROT_LOW;
+ else
+ return MEMCG_PROT_NONE;
}
/**
@@ -6012,10 +6147,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!memcg)
return 0;
+ if (!entry.val) {
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
+ return 0;
+ }
+
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
+ memcg_memory_event(memcg, MEMCG_SWAP_MAX);
+ memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
@@ -6067,7 +6209,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
- READ_ONCE(memcg->swap.limit) -
+ READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
return nr_swap_pages;
}
@@ -6088,7 +6230,7 @@ bool mem_cgroup_swap_full(struct page *page)
return false;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max)
return true;
return false;
@@ -6122,7 +6264,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
static int swap_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.limit);
+ unsigned long max = READ_ONCE(memcg->swap.max);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -6144,15 +6286,23 @@ static ssize_t swap_max_write(struct kernfs_open_file *of,
if (err)
return err;
- mutex_lock(&memcg_limit_mutex);
- err = page_counter_limit(&memcg->swap, max);
- mutex_unlock(&memcg_limit_mutex);
- if (err)
- return err;
+ xchg(&memcg->swap.max, max);
return nbytes;
}
+static int swap_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+
+ seq_printf(m, "max %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
+ seq_printf(m, "fail %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
+
+ return 0;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -6165,6 +6315,12 @@ static struct cftype swap_files[] = {
.seq_show = swap_max_show,
.write = swap_max_write,
},
+ {
+ .name = "swap.events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, swap_events_file),
+ .seq_show = swap_events_show,
+ },
{ } /* terminate */
};
diff --git a/mm/memfd.c b/mm/memfd.c
new file mode 100644
index 000000000000..27069518e3c5
--- /dev/null
+++ b/mm/memfd.c
@@ -0,0 +1,345 @@
+/*
+ * memfd_create system call and file sealing support
+ *
+ * Code was originally included in shmem.c, and broken out to facilitate
+ * use by hugetlbfs as well as tmpfs.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/khugepaged.h>
+#include <linux/syscalls.h>
+#include <linux/hugetlb.h>
+#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
+#include <uapi/linux/memfd.h>
+
+/*
+ * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * so reuse a tag which we firmly believe is never set or cleared on tmpfs
+ * or hugetlbfs because they are memory only filesystems.
+ */
+#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
+#define LAST_SCAN 4 /* about 150ms max */
+
+static void memfd_tag_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+
+ lru_add_drain();
+ start = 0;
+ rcu_read_lock();
+
+ radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
+ page = radix_tree_deref_slot(slot);
+ if (!page || radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ } else if (page_count(page) - page_mapcount(page) > 1) {
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_set(&mapping->i_pages, iter.index,
+ MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+ }
+
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
+ * via get_user_pages(), drivers might have some pending I/O without any active
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * and see whether it has an elevated ref-count. If so, we tag them and wait for
+ * them to be dropped.
+ * The caller must guarantee that no new user will acquire writable references
+ * to those pages to avoid races.
+ */
+static int memfd_wait_for_pins(struct address_space *mapping)
+{
+ struct radix_tree_iter iter;
+ void __rcu **slot;
+ pgoff_t start;
+ struct page *page;
+ int error, scan;
+
+ memfd_tag_pins(mapping);
+
+ error = 0;
+ for (scan = 0; scan <= LAST_SCAN; scan++) {
+ if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+ break;
+
+ if (!scan)
+ lru_add_drain_all();
+ else if (schedule_timeout_killable((HZ << scan) / 200))
+ scan = LAST_SCAN;
+
+ start = 0;
+ rcu_read_lock();
+ radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
+ start, MEMFD_TAG_PINNED) {
+
+ page = radix_tree_deref_slot(slot);
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+
+ page = NULL;
+ }
+
+ if (page &&
+ page_count(page) - page_mapcount(page) != 1) {
+ if (scan < LAST_SCAN)
+ goto continue_resched;
+
+ /*
+ * On the last scan, we clean up all those tags
+ * we inserted; but make a note that we still
+ * found pages pinned.
+ */
+ error = -EBUSY;
+ }
+
+ xa_lock_irq(&mapping->i_pages);
+ radix_tree_tag_clear(&mapping->i_pages,
+ iter.index, MEMFD_TAG_PINNED);
+ xa_unlock_irq(&mapping->i_pages);
+continue_resched:
+ if (need_resched()) {
+ slot = radix_tree_iter_resume(slot, &iter);
+ cond_resched_rcu();
+ }
+ }
+ rcu_read_unlock();
+ }
+
+ return error;
+}
+
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+ if (shmem_file(file))
+ return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+ if (is_file_hugepages(file))
+ return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+ return NULL;
+}
+
+#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_SHRINK | \
+ F_SEAL_GROW | \
+ F_SEAL_WRITE)
+
+static int memfd_add_seals(struct file *file, unsigned int seals)
+{
+ struct inode *inode = file_inode(file);
+ unsigned int *file_seals;
+ int error;
+
+ /*
+ * SEALING
+ * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
+ * but restrict access to a specific subset of file operations. Seals
+ * can only be added, but never removed. This way, mutually untrusted
+ * parties can share common memory regions with a well-defined policy.
+ * A malicious peer can thus never perform unwanted operations on a
+ * shared object.
+ *
+ * Seals are only supported on special tmpfs or hugetlbfs files and
+ * always affect the whole underlying inode. Once a seal is set, it
+ * may prevent some kinds of access to the file. Currently, the
+ * following seals are defined:
+ * SEAL_SEAL: Prevent further seals from being set on this file
+ * SEAL_SHRINK: Prevent the file from shrinking
+ * SEAL_GROW: Prevent the file from growing
+ * SEAL_WRITE: Prevent write access to the file
+ *
+ * As we don't require any trust relationship between two parties, we
+ * must prevent seals from being removed. Therefore, sealing a file
+ * only adds a given set of seals to the file, it never touches
+ * existing seals. Furthermore, the "setting seals"-operation can be
+ * sealed itself, which basically prevents any further seal from being
+ * added.
+ *
+ * Semantics of sealing are only defined on volatile files. Only
+ * anonymous tmpfs and hugetlbfs files support sealing. More
+ * importantly, seals are never written to disk. Therefore, there's
+ * no plan to support it on other file types.
+ */
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EPERM;
+ if (seals & ~(unsigned int)F_ALL_SEALS)
+ return -EINVAL;
+
+ inode_lock(inode);
+
+ file_seals = memfd_file_seals_ptr(file);
+ if (!file_seals) {
+ error = -EINVAL;
+ goto unlock;
+ }
+
+ if (*file_seals & F_SEAL_SEAL) {
+ error = -EPERM;
+ goto unlock;
+ }
+
+ if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
+ error = mapping_deny_writable(file->f_mapping);
+ if (error)
+ goto unlock;
+
+ error = memfd_wait_for_pins(file->f_mapping);
+ if (error) {
+ mapping_allow_writable(file->f_mapping);
+ goto unlock;
+ }
+ }
+
+ *file_seals |= seals;
+ error = 0;
+
+unlock:
+ inode_unlock(inode);
+ return error;
+}
+
+static int memfd_get_seals(struct file *file)
+{
+ unsigned int *seals = memfd_file_seals_ptr(file);
+
+ return seals ? *seals : -EINVAL;
+}
+
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ long error;
+
+ switch (cmd) {
+ case F_ADD_SEALS:
+ /* disallow upper 32bit */
+ if (arg > UINT_MAX)
+ return -EINVAL;
+
+ error = memfd_add_seals(file, arg);
+ break;
+ case F_GET_SEALS:
+ error = memfd_get_seals(file);
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+
+ return error;
+}
+
+#define MFD_NAME_PREFIX "memfd:"
+#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
+#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
+
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ int fd, error;
+ char *name;
+ long len;
+
+ if (!(flags & MFD_HUGETLB)) {
+ if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ return -EINVAL;
+ } else {
+ /* Allow huge page size encoding in flags. */
+ if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
+ return -EINVAL;
+ }
+
+ /* length includes terminating zero */
+ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
+ if (len <= 0)
+ return -EFAULT;
+ if (len > MFD_NAME_MAX_LEN + 1)
+ return -EINVAL;
+
+ name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ strcpy(name, MFD_NAME_PREFIX);
+ if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ /* terminating-zero may have changed after strnlen_user() returned */
+ if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
+ error = -EFAULT;
+ goto err_name;
+ }
+
+ fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
+ if (fd < 0) {
+ error = fd;
+ goto err_name;
+ }
+
+ if (flags & MFD_HUGETLB) {
+ struct user_struct *user = NULL;
+
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ error = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_RDWR | O_LARGEFILE;
+
+ if (flags & MFD_ALLOW_SEALING) {
+ file_seals = memfd_file_seals_ptr(file);
+ *file_seals &= ~F_SEAL_SEAL;
+ }
+
+ fd_install(fd, file);
+ kfree(name);
+ return fd;
+
+err_fd:
+ put_unused_fd(fd);
+err_name:
+ kfree(name);
+ return error;
+}
diff --git a/mm/memory.c b/mm/memory.c
index 01f5464e0fd2..7206a634270b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings.
*
*/
-#ifdef __HAVE_ARCH_PTE_SPECIAL
-# define HAVE_PTE_SPECIAL 1
-#else
-# define HAVE_PTE_SPECIAL 0
-#endif
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
{
unsigned long pfn = pte_pfn(pte);
- if (HAVE_PTE_SPECIAL) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_ops && vma->vm_ops->find_special_page)
@@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
- /* !HAVE_PTE_SPECIAL case follows: */
+ /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
+
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
@@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
/*
* There is no pmd_special() but there may be special pmds, e.g.
* in a direct-access (dax) mapping, so let's just replicate the
- * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+ * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
*/
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
@@ -1660,16 +1656,15 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
*
* The entire address range must be fully contained within the vma.
*
- * Returns 0 if successful.
*/
-int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
+void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
unsigned long size)
{
if (address < vma->vm_start || address + size > vma->vm_end ||
!(vma->vm_flags & VM_PFNMAP))
- return -1;
+ return;
+
zap_page_range_single(vma, address, size, NULL);
- return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);
@@ -1933,7 +1928,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
+ !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
/*
@@ -1955,12 +1951,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vm_insert_mixed);
-int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+/*
+ * If the insertion of PTE failed because someone else already added a
+ * different entry in the mean time, we treat that as success as we assume
+ * the same entry was actually inserted.
+ */
+
+vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, true);
+ int err;
+
+ err = __vm_insert_mixed(vma, addr, pfn, true);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+ return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL(vm_insert_mixed_mkwrite);
+EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
/*
* maps a range of physical memory into the requested pages. the old
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 25982467800b..7deb49f69e27 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page)
return page + pageblock_nr_pages;
}
+static bool is_pageblock_removable_nolock(struct page *page)
+{
+ struct zone *zone;
+ unsigned long pfn;
+
+ /*
+ * We have to be careful here because we are iterating over memory
+ * sections which are not zone aware so we might end up outside of
+ * the zone but still within the section.
+ * We have to take care about the node as well. If the node is offline
+ * its NODE_DATA will be NULL - see page_zone.
+ */
+ if (!node_online(page_to_nid(page)))
+ return false;
+
+ zone = page_zone(page);
+ pfn = page_to_pfn(page);
+ if (!zone_spans_pfn(zone, pfn))
+ return false;
+
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+}
+
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
diff --git a/mm/mmap.c b/mm/mmap.c
index d817764a9974..d1eb87ef4b1a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->data_vm += npages;
}
-static int special_mapping_fault(struct vm_fault *vmf);
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
/*
* Having a close hook prevents vma merging regardless of flags.
@@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = {
.fault = special_mapping_fault,
};
-static int special_mapping_fault(struct vm_fault *vmf)
+static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgoff_t pgoff;
diff --git a/mm/mremap.c b/mm/mremap.c
index 049470aa1e3e..5c2e18505f75 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -191,8 +191,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
drop_rmap_locks(vma);
}
-#define LATENCY_LIMIT (64 * PAGE_SIZE)
-
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
@@ -247,8 +245,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
- if (extent > LATENCY_LIMIT)
- extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks, &need_flush);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 13723736d38f..4452d8bd9ae4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
return -ENOMEM;
}
-int filemap_fault(struct vm_fault *vmf)
+vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8ba6cb88cf58..84081e77bc51 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
int nid;
if (is_memcg_oom(oc)) {
- oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
+ oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
return CONSTRAINT_MEMCG;
}
@@ -913,7 +913,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
/* Raise event before sending signal: task reaper must see this */
count_vm_event(OOM_KILL);
- count_memcg_event_mm(mm, OOM_KILL);
+ memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
/*
* We should send SIGKILL before granting access to memory reserves
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22320ea27489..1521100f1e63 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page)
/*
* This function checks whether a page is free && is the buddy
- * we can do coalesce a page and its buddy if
+ * we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we set ->_mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE.
- * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
- * serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount
- * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
- * field.
+ * free pages of length of (1 << order) and marked with PageBuddy.
+ * Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
@@ -946,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
}
switch (page - head_page) {
case 1:
- /* the first tail page: ->mapping is compound_mapcount() */
+ /* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
bad_page(page, "nonzero compound_mapcount", 0);
goto out;
@@ -955,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
case 2:
/*
* the second tail page: ->mapping is
- * page_deferred_list().next -- ignore value.
+ * deferred_list.next -- ignore value.
*/
break;
default:
@@ -3064,7 +3061,7 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
static int __init fail_page_alloc_debugfs(void)
{
- umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ umode_t mode = S_IFREG | 0600;
struct dentry *dir;
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -3701,7 +3698,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
#endif /* CONFIG_COMPACTION */
#ifdef CONFIG_LOCKDEP
-struct lockdep_map __fs_reclaim_map =
+static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
static bool __need_fs_reclaim(gfp_t gfp_mask)
@@ -3726,17 +3723,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
return true;
}
+void __fs_reclaim_acquire(void)
+{
+ lock_map_acquire(&__fs_reclaim_map);
+}
+
+void __fs_reclaim_release(void)
+{
+ lock_map_release(&__fs_reclaim_map);
+}
+
void fs_reclaim_acquire(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
- lock_map_acquire(&__fs_reclaim_map);
+ __fs_reclaim_acquire();
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
if (__need_fs_reclaim(gfp_mask))
- lock_map_release(&__fs_reclaim_map);
+ __fs_reclaim_release();
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
@@ -3754,8 +3761,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- noreclaim_flag = memalloc_noreclaim_save();
fs_reclaim_acquire(gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
@@ -3763,8 +3770,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
ac->nodemask);
current->reclaim_state = NULL;
- fs_reclaim_release(gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(gfp_mask);
cond_resched();
@@ -4162,7 +4169,6 @@ retry:
* orientated.
*/
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
- ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
@@ -4326,8 +4332,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask,
- unsigned int order, struct alloc_context *ac)
+static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4358,7 +4363,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, order, &ac);
+ finalise_ac(gfp_mask, &ac);
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -6229,18 +6234,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long size, freesize, memmap_pages;
unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
- realsize = freesize = zone->present_pages;
+ freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
- memmap_pages = calc_memmap_size(size, realsize);
+ memmap_pages = calc_memmap_size(size, freesize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
@@ -6272,7 +6277,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
- zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
+ zone->managed_pages = freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
#endif
@@ -7682,29 +7687,6 @@ unmovable:
return true;
}
-bool is_pageblock_removable_nolock(struct page *page)
-{
- struct zone *zone;
- unsigned long pfn;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
-}
-
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 2a8df3ad60a4..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,40 @@
#include <linux/bug.h>
#include <asm/page.h>
+static void propagate_protected_usage(struct page_counter *c,
+ unsigned long usage)
+{
+ unsigned long protected, old_protected;
+ long delta;
+
+ if (!c->parent)
+ return;
+
+ if (c->min || atomic_long_read(&c->min_usage)) {
+ if (usage <= c->min)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->min_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_min_usage);
+ }
+
+ if (c->low || atomic_long_read(&c->low_usage)) {
+ if (usage <= c->low)
+ protected = usage;
+ else
+ protected = 0;
+
+ old_protected = atomic_long_xchg(&c->low_usage, protected);
+ delta = protected - old_protected;
+ if (delta)
+ atomic_long_add(delta, &c->parent->children_low_usage);
+ }
+}
+
/**
* page_counter_cancel - take pages out of the local counter
* @counter: counter
@@ -22,7 +56,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
long new;
- new = atomic_long_sub_return(nr_pages, &counter->count);
+ new = atomic_long_sub_return(nr_pages, &counter->usage);
+ propagate_protected_usage(counter, new);
/* More uncharges than charges? */
WARN_ON_ONCE(new < 0);
}
@@ -41,7 +76,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
for (c = counter; c; c = c->parent) {
long new;
- new = atomic_long_add_return(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ propagate_protected_usage(counter, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -82,9 +118,10 @@ bool page_counter_try_charge(struct page_counter *counter,
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
- new = atomic_long_add_return(nr_pages, &c->count);
- if (new > c->limit) {
- atomic_long_sub(nr_pages, &c->count);
+ new = atomic_long_add_return(nr_pages, &c->usage);
+ if (new > c->max) {
+ atomic_long_sub(nr_pages, &c->usage);
+ propagate_protected_usage(counter, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -93,6 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
+ propagate_protected_usage(counter, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
@@ -123,20 +161,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
}
/**
- * page_counter_limit - limit the number of pages allowed
+ * page_counter_set_max - set the maximum number of pages allowed
* @counter: counter
- * @limit: limit to set
+ * @nr_pages: limit to set
*
* Returns 0 on success, -EBUSY if the current number of pages on the
* counter already exceeds the specified limit.
*
* The caller must serialize invocations on the same counter.
*/
-int page_counter_limit(struct page_counter *counter, unsigned long limit)
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
for (;;) {
unsigned long old;
- long count;
+ long usage;
/*
* Update the limit while making sure that it's not
@@ -149,22 +187,56 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- count = atomic_long_read(&counter->count);
+ usage = atomic_long_read(&counter->usage);
- if (count > limit)
+ if (usage > nr_pages)
return -EBUSY;
- old = xchg(&counter->limit, limit);
+ old = xchg(&counter->max, nr_pages);
- if (atomic_long_read(&counter->count) <= count)
+ if (atomic_long_read(&counter->usage) <= usage)
return 0;
- counter->limit = old;
+ counter->max = old;
cond_resched();
}
}
/**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->min = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+ struct page_counter *c;
+
+ counter->low = nr_pages;
+
+ for (c = counter; c; c = c->parent)
+ propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
* @max: string meaning maximum possible value
diff --git a/mm/page_idle.c b/mm/page_idle.c
index e412a63b2b74..6302bc62c27d 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -201,7 +201,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
}
static struct bin_attribute page_idle_bitmap_attr =
- __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+ __BIN_ATTR(bitmap, 0600,
page_idle_bitmap_read, page_idle_bitmap_write, 0);
static struct bin_attribute *page_idle_bin_attrs[] = {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 75d21a2259b3..d80adfe702d3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -631,8 +631,8 @@ static int __init pageowner_init(void)
return 0;
}
- dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
- NULL, &proc_page_owner_operations);
+ dentry = debugfs_create_file("page_owner", 0400, NULL,
+ NULL, &proc_page_owner_operations);
return PTR_ERR_OR_ZERO(dentry);
}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 063ff60ecd90..b5fdd43b60c9 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -144,7 +144,7 @@ alloc_buffer:
spin_unlock_irq(&pcpu_lock);
/* there can be at most this many free and allocated fragments */
- buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
+ buffer = vmalloc(array_size(sizeof(int), (2 * max_nr_alloc + 1)));
if (!buffer)
return -ENOMEM;
diff --git a/mm/readahead.c b/mm/readahead.c
index 539bbb6c1fad..e273f0de3376 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -140,23 +140,23 @@ out:
}
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all
- * the pages first, then submits them all for I/O. This avoids the very bad
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read,
- unsigned long lookahead_size)
+unsigned int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
- int ret = 0;
+ unsigned int nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
@@ -177,8 +177,18 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
rcu_read_lock();
page = radix_tree_lookup(&mapping->i_pages, page_offset);
rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page))
+ if (page && !radix_tree_exceptional_entry(page)) {
+ /*
+ * Page already present? Kick off the current batch of
+ * contiguous pages before continuing with the next
+ * batch.
+ */
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool, nr_pages,
+ gfp_mask);
+ nr_pages = 0;
continue;
+ }
page = __page_cache_alloc(gfp_mask);
if (!page)
@@ -187,7 +197,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
- ret++;
+ nr_pages++;
}
/*
@@ -195,11 +205,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- if (ret)
- read_pages(mapping, filp, &page_pool, ret, gfp_mask);
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
BUG_ON(!list_empty(&page_pool));
out:
- return ret;
+ return nr_pages;
}
/*
@@ -223,16 +233,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
nr_to_read = min(nr_to_read, max_pages);
while (nr_to_read) {
- int err;
-
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- err = __do_page_cache_readahead(mapping, filp,
- offset, this_chunk, 0);
- if (err < 0)
- return err;
+ __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
offset += this_chunk;
nr_to_read -= this_chunk;
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d6c7e595415..2cab84403055 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
struct radix_tree_node *node;
- void **pslot;
+ void __rcu **pslot;
void *item;
VM_BUG_ON(!expected);
@@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
/* ifdef here to avoid bloating shmem.o when not necessary */
-int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly;
#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static int shmem_parse_huge(const char *str)
@@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
}
#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
+{
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
+ shmem_huge != SHMEM_HUGE_DENY)
+ return true;
+ return false;
+}
+
/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
@@ -682,7 +691,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
struct page *page;
unsigned long swapped = 0;
@@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
{
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
+ struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
spin_lock_irq(&info->lock);
@@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
spin_unlock_irq(&info->lock);
}
generic_fillattr(inode, stat);
+
+ if (is_huge_enabled(sb_info))
+ stat->blksize = HPAGE_PMD_SIZE;
+
return 0;
}
@@ -1098,13 +1112,19 @@ static void shmem_evict_inode(struct inode *inode)
static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
{
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
unsigned long found = -1;
unsigned int checked = 0;
rcu_read_lock();
radix_tree_for_each_slot(slot, root, &iter, 0) {
- if (*slot == item) {
+ void *entry = radix_tree_deref_slot(slot);
+
+ if (radix_tree_deref_retry(entry)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ if (entry == item) {
found = iter.index;
break;
}
@@ -1322,9 +1342,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
- if (mem_cgroup_try_charge_swap(page, swap))
- goto free_swap;
-
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -1353,7 +1370,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
-free_swap:
put_swap_page(page, swap);
redirty:
set_page_dirty(page);
@@ -1404,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
struct shmem_inode_info *info, pgoff_t index)
{
/* Create a pseudo vma that just contains the policy */
- vma->vm_start = 0;
+ memset(vma, 0, sizeof(*vma));
/* Bias interleave by inode number to distribute better across nodes */
vma->vm_pgoff = index + info->vfs_inode.i_ino;
- vma->vm_ops = NULL;
vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
}
@@ -1931,14 +1946,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in
return ret;
}
-static int shmem_fault(struct vm_fault *vmf)
+static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
enum sgp_type sgp;
- int error;
- int ret = VM_FAULT_LOCKED;
+ int err;
+ vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
@@ -2006,10 +2021,10 @@ static int shmem_fault(struct vm_fault *vmf)
else if (vma->vm_flags & VM_HUGEPAGE)
sgp = SGP_HUGE;
- error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
gfp, vma, vmf, &ret);
- if (error)
- return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
+ if (err)
+ return vmf_error(err);
return ret;
}
@@ -2616,241 +2631,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
return offset;
}
-/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
- * so reuse a tag which we firmly believe is never set or cleared on shmem.
- */
-#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE
-#define LAST_SCAN 4 /* about 150ms max */
-
-static void shmem_tag_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
-
- lru_add_drain();
- start = 0;
- rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_set(&mapping->i_pages, iter.index,
- SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
- }
-
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
-}
-
-/*
- * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
- * via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
- * and see whether it has an elevated ref-count. If so, we tag them and wait for
- * them to be dropped.
- * The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
- */
-static int shmem_wait_for_pins(struct address_space *mapping)
-{
- struct radix_tree_iter iter;
- void **slot;
- pgoff_t start;
- struct page *page;
- int error, scan;
-
- shmem_tag_pins(mapping);
-
- error = 0;
- for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED))
- break;
-
- if (!scan)
- lru_add_drain_all();
- else if (schedule_timeout_killable((HZ << scan) / 200))
- scan = LAST_SCAN;
-
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
- start, SHMEM_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
- /*
- * On the last scan, we clean up all those tags
- * we inserted; but make a note that we still
- * found pages pinned.
- */
- error = -EBUSY;
- }
-
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_clear(&mapping->i_pages,
- iter.index, SHMEM_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
- }
- rcu_read_unlock();
- }
-
- return error;
-}
-
-static unsigned int *memfd_file_seals_ptr(struct file *file)
-{
- if (file->f_op == &shmem_file_operations)
- return &SHMEM_I(file_inode(file))->seals;
-
-#ifdef CONFIG_HUGETLBFS
- if (file->f_op == &hugetlbfs_file_operations)
- return &HUGETLBFS_I(file_inode(file))->seals;
-#endif
-
- return NULL;
-}
-
-#define F_ALL_SEALS (F_SEAL_SEAL | \
- F_SEAL_SHRINK | \
- F_SEAL_GROW | \
- F_SEAL_WRITE)
-
-static int memfd_add_seals(struct file *file, unsigned int seals)
-{
- struct inode *inode = file_inode(file);
- unsigned int *file_seals;
- int error;
-
- /*
- * SEALING
- * Sealing allows multiple parties to share a shmem-file but restrict
- * access to a specific subset of file operations. Seals can only be
- * added, but never removed. This way, mutually untrusted parties can
- * share common memory regions with a well-defined policy. A malicious
- * peer can thus never perform unwanted operations on a shared object.
- *
- * Seals are only supported on special shmem-files and always affect
- * the whole underlying inode. Once a seal is set, it may prevent some
- * kinds of access to the file. Currently, the following seals are
- * defined:
- * SEAL_SEAL: Prevent further seals from being set on this file
- * SEAL_SHRINK: Prevent the file from shrinking
- * SEAL_GROW: Prevent the file from growing
- * SEAL_WRITE: Prevent write access to the file
- *
- * As we don't require any trust relationship between two parties, we
- * must prevent seals from being removed. Therefore, sealing a file
- * only adds a given set of seals to the file, it never touches
- * existing seals. Furthermore, the "setting seals"-operation can be
- * sealed itself, which basically prevents any further seal from being
- * added.
- *
- * Semantics of sealing are only defined on volatile files. Only
- * anonymous shmem files support sealing. More importantly, seals are
- * never written to disk. Therefore, there's no plan to support it on
- * other file types.
- */
-
- if (!(file->f_mode & FMODE_WRITE))
- return -EPERM;
- if (seals & ~(unsigned int)F_ALL_SEALS)
- return -EINVAL;
-
- inode_lock(inode);
-
- file_seals = memfd_file_seals_ptr(file);
- if (!file_seals) {
- error = -EINVAL;
- goto unlock;
- }
-
- if (*file_seals & F_SEAL_SEAL) {
- error = -EPERM;
- goto unlock;
- }
-
- if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
- error = mapping_deny_writable(file->f_mapping);
- if (error)
- goto unlock;
-
- error = shmem_wait_for_pins(file->f_mapping);
- if (error) {
- mapping_allow_writable(file->f_mapping);
- goto unlock;
- }
- }
-
- *file_seals |= seals;
- error = 0;
-
-unlock:
- inode_unlock(inode);
- return error;
-}
-
-static int memfd_get_seals(struct file *file)
-{
- unsigned int *seals = memfd_file_seals_ptr(file);
-
- return seals ? *seals : -EINVAL;
-}
-
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- long error;
-
- switch (cmd) {
- case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
- error = memfd_add_seals(file, arg);
- break;
- case F_GET_SEALS:
- error = memfd_get_seals(file);
- break;
- default:
- error = -EINVAL;
- break;
- }
-
- return error;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -3233,7 +3013,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (len > PAGE_SIZE)
return -ENAMETOOLONG;
- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK | 0777, 0,
+ VM_NORESERVE);
if (!inode)
return -ENOSPC;
@@ -3428,6 +3209,15 @@ static int shmem_match(struct inode *ino, void *vfh)
return ino->i_ino == inum && fh[0] == ino->i_generation;
}
+/* Find any alias of inode, but prefer a hashed alias */
+static struct dentry *shmem_find_alias(struct inode *inode)
+{
+ struct dentry *alias = d_find_alias(inode);
+
+ return alias ?: d_find_any_alias(inode);
+}
+
+
static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
@@ -3444,7 +3234,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
- dentry = d_find_alias(inode);
+ dentry = shmem_find_alias(inode);
iput(inode);
}
@@ -3656,7 +3446,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
sbinfo->max_blocks << (PAGE_SHIFT - 10));
if (sbinfo->max_inodes != shmem_default_max_inodes())
seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
- if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
+ if (sbinfo->mode != (0777 | S_ISVTX))
seq_printf(seq, ",mode=%03ho", sbinfo->mode);
if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
seq_printf(seq, ",uid=%u",
@@ -3673,93 +3463,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-#define MFD_NAME_PREFIX "memfd:"
-#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
-#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
-
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
-{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
-
- if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
- return -EINVAL;
- } else {
- /* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
- (MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
- return -EINVAL;
- }
-
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
- return -EINVAL;
-
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
-
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
- error = -EFAULT;
- goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
- goto err_name;
- }
-
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
-
- if (flags & MFD_HUGETLB) {
- struct user_struct *user = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
- }
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
- file->f_flags |= O_RDWR | O_LARGEFILE;
-
- if (flags & MFD_ALLOW_SEALING) {
- file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_fd:
- put_unused_fd(fd);
-err_name:
- kfree(name);
- return error;
-}
-
#endif /* CONFIG_TMPFS */
static void shmem_put_super(struct super_block *sb)
@@ -3784,7 +3487,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
if (!sbinfo)
return -ENOMEM;
- sbinfo->mode = S_IRWXUGO | S_ISVTX;
+ sbinfo->mode = 0777 | S_ISVTX;
sbinfo->uid = current_fsuid();
sbinfo->gid = current_fsgid();
sb->s_fs_info = sbinfo;
@@ -4227,7 +3930,7 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l
d_set_d_op(path.dentry, &anon_ops);
res = ERR_PTR(-ENOSPC);
- inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
+ inode = shmem_get_inode(sb, NULL, S_IFREG | 0777, 0, flags);
if (!inode)
goto put_memory;
diff --git a/mm/slab.c b/mm/slab.c
index 2f308253c3d7..aa76a70e087e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void)
{
int i;
- BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
- sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
@@ -2665,6 +2663,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
invalid_mask, &invalid_mask, flags, &flags);
dump_stack();
}
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
check_irq_off();
@@ -3071,6 +3070,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
+ WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
@@ -4338,7 +4338,8 @@ static int leaks_show(struct seq_file *m, void *p)
if (x[0] == x[1]) {
/* Increase the buffer size */
mutex_unlock(&slab_mutex);
- m->private = kzalloc(x[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+ m->private = kcalloc(x[0] * 4, sizeof(unsigned long),
+ GFP_KERNEL);
if (!m->private) {
/* Too bad, we are really out */
m->private = x;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 98dcdc352062..890b1f04a03a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -136,6 +136,7 @@ void slab_init_memcg_params(struct kmem_cache *s)
s->memcg_params.root_cache = NULL;
RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
INIT_LIST_HEAD(&s->memcg_params.children);
+ s->memcg_params.dying = false;
}
static int init_memcg_params(struct kmem_cache *s,
@@ -608,7 +609,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (memcg->kmem_state != KMEM_ONLINE)
+ if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -712,6 +713,9 @@ void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
WARN_ON_ONCE(s->memcg_params.deact_fn))
return;
+ if (s->memcg_params.root_cache->memcg_params.dying)
+ return;
+
/* pin memcg so that @s doesn't get destroyed in the middle */
css_get(&s->memcg_params.memcg->css);
@@ -823,11 +827,36 @@ static int shutdown_memcg_caches(struct kmem_cache *s)
return -EBUSY;
return 0;
}
+
+static void flush_memcg_workqueue(struct kmem_cache *s)
+{
+ mutex_lock(&slab_mutex);
+ s->memcg_params.dying = true;
+ mutex_unlock(&slab_mutex);
+
+ /*
+ * SLUB deactivates the kmem_caches through call_rcu_sched. Make
+ * sure all registered rcu callbacks have been invoked.
+ */
+ if (IS_ENABLED(CONFIG_SLUB))
+ rcu_barrier_sched();
+
+ /*
+ * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
+ * deactivates the memcg kmem_caches through workqueue. Make sure all
+ * previous workitems on workqueue are processed.
+ */
+ flush_workqueue(memcg_kmem_cache_wq);
+}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
+
+static inline void flush_memcg_workqueue(struct kmem_cache *s)
+{
+}
#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
void slab_kmem_cache_release(struct kmem_cache *s)
@@ -845,6 +874,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
+ flush_memcg_workqueue(s);
+
get_online_cpus();
get_online_mems();
@@ -1212,9 +1243,9 @@ void cache_random_seq_destroy(struct kmem_cache *cachep)
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
#ifdef CONFIG_SLAB
-#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
+#define SLABINFO_RIGHTS (0600)
#else
-#define SLABINFO_RIGHTS S_IRUSR
+#define SLABINFO_RIGHTS (0400)
#endif
static void print_slabinfo_header(struct seq_file *m)
diff --git a/mm/slob.c b/mm/slob.c
index 623e8a5c46ce..307c2c9feb44 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags, node);
}
- if (b && c->ctor)
+ if (b && c->ctor) {
+ WARN_ON_ONCE(flags & __GFP_ZERO);
c->ctor(b);
+ }
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
diff --git a/mm/slub.c b/mm/slub.c
index 44aa7847324a..a3b8467c14af 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -52,11 +52,11 @@
* and to synchronize major metadata changes to slab cache structures.
*
* The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects the second
- * double word in the page struct. Meaning
+ * have the ability to do a cmpxchg_double. It only protects:
* A. page->freelist -> List of object free in a page
- * B. page->counters -> Counters of objects
- * C. page->frozen -> frozen state
+ * B. page->inuse -> Number of objects in use
+ * C. page->objects -> Number of objects in page
+ * D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
* on any list. The processor that froze the slab is the one who can
@@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved)
+static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
- return (((unsigned int)PAGE_SIZE << order) - reserved) / size;
+ return ((unsigned int)PAGE_SIZE << order) / size;
}
static inline struct kmem_cache_order_objects oo_make(unsigned int order,
- unsigned int size, unsigned int reserved)
+ unsigned int size)
{
struct kmem_cache_order_objects x = {
- (order << OO_SHIFT) + order_objects(order, size, reserved)
+ (order << OO_SHIFT) + order_objects(order, size)
};
return x;
@@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static inline void set_page_slub_counters(struct page *page, unsigned long counters_new)
-{
- struct page tmp;
- tmp.counters = counters_new;
- /*
- * page->counters can cover frozen/inuse/objects as well
- * as page->_refcount. If we assign to ->counters directly
- * we run the risk of losing updates to page->_refcount, so
- * be careful and only assign to the fields we need.
- */
- page->frozen = tmp.frozen;
- page->inuse = tmp.inuse;
- page->objects = tmp.objects;
-}
-
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
@@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
return true;
}
@@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (page->freelist == freelist_old &&
page->counters == counters_old) {
page->freelist = freelist_new;
- set_page_slub_counters(page, counters_new);
+ page->counters = counters_new;
slab_unlock(page);
local_irq_restore(flags);
return true;
@@ -711,7 +696,7 @@ void object_err(struct kmem_cache *s, struct page *page,
print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page,
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
const char *fmt, ...)
{
va_list args;
@@ -847,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = (PAGE_SIZE << compound_order(page)) - s->reserved;
+ length = PAGE_SIZE << compound_order(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -936,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
return 0;
}
- maxobj = order_objects(compound_order(page), s->size, s->reserved);
+ maxobj = order_objects(compound_order(page), s->size);
if (page->objects > maxobj) {
slab_err(s, page, "objects %u > max %u",
page->objects, maxobj);
@@ -986,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = order_objects(compound_order(page), s->size, s->reserved);
+ max_objects = order_objects(compound_order(page), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -1694,24 +1679,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
- page_mapcount_reset(page);
+ page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
memcg_uncharge_slab(page, order, s);
__free_pages(page, order);
}
-#define need_reserve_slab_rcu \
- (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
-
static void rcu_free_slab(struct rcu_head *h)
{
- struct page *page;
-
- if (need_reserve_slab_rcu)
- page = virt_to_head_page(h);
- else
- page = container_of((struct list_head *)h, struct page, lru);
+ struct page *page = container_of(h, struct page, rcu_head);
__free_slab(page->slab_cache, page);
}
@@ -1719,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h)
static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
- struct rcu_head *head;
-
- if (need_reserve_slab_rcu) {
- int order = compound_order(page);
- int offset = (PAGE_SIZE << order) - s->reserved;
-
- VM_BUG_ON(s->reserved != sizeof(*head));
- head = page_address(page) + offset;
- } else {
- head = &page->rcu_head;
- }
-
- call_rcu(head, rcu_free_slab);
+ call_rcu(&page->rcu_head, rcu_free_slab);
} else
__free_slab(s, page);
}
@@ -2444,6 +2409,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
struct kmem_cache_cpu *c = *pc;
struct page *page;
+ WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));
+
freelist = get_partial(s, flags, node, c);
if (freelist)
@@ -3226,21 +3193,21 @@ static unsigned int slub_min_objects;
*/
static inline unsigned int slab_order(unsigned int size,
unsigned int min_objects, unsigned int max_order,
- unsigned int fract_leftover, unsigned int reserved)
+ unsigned int fract_leftover)
{
unsigned int min_order = slub_min_order;
unsigned int order;
- if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
+ if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
- for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
+ for (order = max(min_order, (unsigned int)get_order(min_objects * size));
order <= max_order; order++) {
unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
unsigned int rem;
- rem = (slab_size - reserved) % size;
+ rem = slab_size % size;
if (rem <= slab_size / fract_leftover)
break;
@@ -3249,7 +3216,7 @@ static inline unsigned int slab_order(unsigned int size,
return order;
}
-static inline int calculate_order(unsigned int size, unsigned int reserved)
+static inline int calculate_order(unsigned int size)
{
unsigned int order;
unsigned int min_objects;
@@ -3266,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
min_objects = slub_min_objects;
if (!min_objects)
min_objects = 4 * (fls(nr_cpu_ids) + 1);
- max_objects = order_objects(slub_max_order, size, reserved);
+ max_objects = order_objects(slub_max_order, size);
min_objects = min(min_objects, max_objects);
while (min_objects > 1) {
@@ -3275,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
- slub_max_order, fraction, reserved);
+ slub_max_order, fraction);
if (order <= slub_max_order)
return order;
fraction /= 2;
@@ -3287,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1, reserved);
+ order = slab_order(size, 1, slub_max_order, 1);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1, reserved);
+ order = slab_order(size, 1, MAX_ORDER, 1);
if (order < MAX_ORDER)
return order;
return -ENOSYS;
@@ -3562,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size, s->reserved);
+ order = calculate_order(size);
if ((int)order < 0)
return 0;
@@ -3580,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* Determine the number of objects per slab
*/
- s->oo = oo_make(order, size, s->reserved);
- s->min = oo_make(get_order(size), size, s->reserved);
+ s->oo = oo_make(order, size);
+ s->min = oo_make(get_order(size), size);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
@@ -3591,14 +3558,10 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
- s->reserved = 0;
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
- if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU))
- s->reserved = sizeof(struct rcu_head);
-
if (!calculate_sizes(s, -1))
goto error;
if (disable_higher_order_debug) {
@@ -3660,8 +3623,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
- sizeof(long), GFP_ATOMIC);
+ unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
+ sizeof(long),
+ GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, text, s->name);
@@ -4239,12 +4203,6 @@ void __init kmem_cache_init(void)
SLAB_HWCACHE_ALIGN, 0, 0);
kmem_cache = bootstrap(&boot_kmem_cache);
-
- /*
- * Allocate kmem_cache_node properly from the kmem_cache slab.
- * kmem_cache_node is separately allocated so no need to
- * update any list pointers.
- */
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
@@ -4455,8 +4413,9 @@ static long validate_slab_cache(struct kmem_cache *s)
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
- sizeof(unsigned long), GFP_KERNEL);
+ unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
+ sizeof(unsigned long),
+ GFP_KERNEL);
struct kmem_cache_node *n;
if (!map)
@@ -4616,8 +4575,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
- unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
- sizeof(unsigned long), GFP_KERNEL);
+ unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
+ sizeof(unsigned long),
+ GFP_KERNEL);
struct kmem_cache_node *n;
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
@@ -4793,7 +4753,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int x;
unsigned long *nodes;
- nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
+ nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!nodes)
return -ENOMEM;
@@ -5117,12 +5077,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(destroy_by_rcu);
-static ssize_t reserved_show(struct kmem_cache *s, char *buf)
-{
- return sprintf(buf, "%u\n", s->reserved);
-}
-SLAB_ATTR_RO(reserved);
-
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
@@ -5342,7 +5296,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
unsigned long sum = 0;
int cpu;
int len;
- int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
+ int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -5435,7 +5389,6 @@ static struct attribute *slab_attrs[] = {
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
- &reserved_attr.attr,
&slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index 73dc2fcc0eab..f13f2723950a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr)
section_nr++;
if (present_section_nr(section_nr))
return section_nr;
- } while ((section_nr < NR_MEM_SECTIONS) &&
- (section_nr <= __highest_present_section_nr));
+ } while ((section_nr <= __highest_present_section_nr));
return -1;
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
((section_nr >= 0) && \
- (section_nr < NR_MEM_SECTIONS) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
@@ -524,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func)
map_count = 1;
}
/* ok, last chunk */
- alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
+ alloc_func(data, pnum_begin, __highest_present_section_nr+1,
map_count, nodeid_begin);
}
diff --git a/mm/swap.c b/mm/swap.c
index 3dd518832096..26fc9b5f1b6c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
@@ -743,7 +744,7 @@ void release_pages(struct page **pages, int nr)
flags);
locked_pgdat = NULL;
}
- put_zone_device_private_or_public_page(page);
+ put_devmap_managed_page(page);
continue;
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index f2641894f440..a791411fed71 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -122,12 +122,12 @@ static int alloc_swap_slot_cache(unsigned int cpu)
* as kvzalloc could trigger reclaim and get_swap_page,
* which can lock swap_slots_cache_mutex.
*/
- slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+ slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
GFP_KERNEL);
if (!slots)
return -ENOMEM;
- slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+ slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
GFP_KERNEL);
if (!slots_ret) {
kvfree(slots);
@@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page)
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, true, &entry);
- return entry;
+ goto out;
}
/*
@@ -347,10 +347,14 @@ repeat:
}
mutex_unlock(&cache->alloc_lock);
if (entry.val)
- return entry;
+ goto out;
}
get_swap_pages(1, false, &entry);
-
+out:
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ put_swap_page(page, entry);
+ entry.val = 0;
+ }
return entry;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 07f9aa2340c3..ecee9c6c4cc1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -216,9 +216,6 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
- if (mem_cgroup_try_charge_swap(page, entry))
- goto fail;
-
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
@@ -623,7 +620,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
unsigned int i, nr;
nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
- spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
+ spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
if (!spaces)
return -ENOMEM;
for (i = 0; i < nr; i++) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 78a015fcec3b..2cc2972eedaf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -100,7 +100,7 @@ atomic_t nr_rotate_swap = ATOMIC_INIT(0);
static inline unsigned char swap_count(unsigned char ent)
{
- return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
+ return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
/* returns 1 if swap entry is freed */
@@ -3196,7 +3196,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info),
+ cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
GFP_KERNEL);
if (!cluster_info) {
error = -ENOMEM;
@@ -3233,7 +3233,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (IS_ENABLED(CONFIG_FRONTSWAP))
- frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long),
+ frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
+ sizeof(long),
GFP_KERNEL);
if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 39791b81ede7..5029f241908f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ bool zeropage,
+ bool *mmap_changing)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -431,6 +432,15 @@ retry:
down_read(&dst_mm->mmap_sem);
/*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ /*
* Make sure the vma is not shared, that the dst range is
* both valid and fully within a single existing vma.
*/
@@ -563,13 +573,15 @@ out:
}
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len)
+ unsigned long src_start, unsigned long len,
+ bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+ mmap_changing);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len)
+ unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true);
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
}
diff --git a/mm/util.c b/mm/util.c
index c2d0a7cdb189..3351659200e6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap);
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
- * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
+ * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
+ * fall back to vmalloc.
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
* so the given set of flags has to be compatible.
*/
- WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+ if ((flags & GFP_KERNEL) != GFP_KERNEL)
+ return kmalloc_node(size, flags, node);
/*
* We want to attempt a large physically contiguous block first because
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 63a5f502da08..cfea25be7754 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va)
vunmap_page_range(va->va_start, va->va_end);
}
-static void vmap_debug_free_range(unsigned long start, unsigned long end)
-{
- /*
- * Unmap page tables and force a TLB flush immediately if pagealloc
- * debugging is enabled. This catches use after free bugs similarly to
- * those in linear kernel virtual address space after a page has been
- * freed.
- *
- * All the lazy freeing logic is still retained, in order to minimise
- * intrusiveness of this debugging feature.
- *
- * This is going to be *slow* (linear kernel virtual address debugging
- * doesn't do a broadcast TLB flush so it is a lot faster).
- */
- if (debug_pagealloc_enabled()) {
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
- }
-}
-
/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
@@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
unmap_vmap_area(va);
+ if (debug_pagealloc_enabled())
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+
free_vmap_area_noflush(va);
}
@@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ if (debug_pagealloc_enabled())
+ flush_tlb_kernel_range((unsigned long)addr,
+ (unsigned long)addr + size);
+
spin_lock(&vb->lock);
/* Expand dirty range */
@@ -1141,16 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(addr > VMALLOC_END);
BUG_ON(!PAGE_ALIGNED(addr));
- debug_check_no_locks_freed(mem, size);
- vmap_debug_free_range(addr, addr+size);
-
if (likely(count <= VMAP_MAX_ALLOC)) {
+ debug_check_no_locks_freed(mem, size);
vb_free(mem, size);
return;
}
va = find_vmap_area(addr);
BUG_ON(!va);
+ debug_check_no_locks_freed((void *)va->va_start,
+ (va->va_end - va->va_start));
free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -1499,7 +1486,6 @@ struct vm_struct *remove_vm_area(const void *addr)
va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
- vmap_debug_free_range(va->va_start, va->va_end);
kasan_free_shadow(vm);
free_unmap_vmap_area(va);
@@ -1519,16 +1505,17 @@ static void __vunmap(const void *addr, int deallocate_pages)
addr))
return;
- area = remove_vm_area(addr);
+ area = find_vmap_area((unsigned long)addr)->vm;
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
return;
}
- debug_check_no_locks_freed(addr, get_vm_area_size(area));
- debug_check_no_obj_freed(addr, get_vm_area_size(area));
+ debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
+ debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
+ remove_vm_area(addr);
if (deallocate_pages) {
int i;
@@ -2754,11 +2741,11 @@ static const struct seq_operations vmalloc_op = {
static int __init proc_vmalloc_init(void)
{
if (IS_ENABLED(CONFIG_NUMA))
- proc_create_seq_private("vmallocinfo", S_IRUSR, NULL,
+ proc_create_seq_private("vmallocinfo", 0400, NULL,
&vmalloc_op,
nr_node_ids * sizeof(unsigned int), NULL);
else
- proc_create_seq("vmallocinfo", S_IRUSR, NULL, &vmalloc_op);
+ proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
return 0;
}
module_init(proc_vmalloc_init);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 85350ce2d25d..4854584ec436 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
-static enum vmpressure_levels str_to_level(const char *arg)
-{
- enum vmpressure_levels level;
-
- for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++)
- if (!strcmp(vmpressure_str_levels[level], arg))
- return level;
- return -1;
-}
-
-static enum vmpressure_modes str_to_mode(const char *arg)
-{
- enum vmpressure_modes mode;
-
- for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++)
- if (!strcmp(vmpressure_str_modes[mode], arg))
- return mode;
- return -1;
-}
-
#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2)
/**
@@ -390,27 +370,26 @@ int vmpressure_register_event(struct mem_cgroup *memcg,
char *token;
int ret = 0;
- spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL);
+ spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
if (!spec) {
ret = -ENOMEM;
goto out;
}
- strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN);
/* Find required level */
token = strsep(&spec, ",");
- level = str_to_level(token);
- if (level == -1) {
- ret = -EINVAL;
+ level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (level < 0) {
+ ret = level;
goto out;
}
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = str_to_mode(token);
- if (mode == -1) {
- ret = -EINVAL;
+ mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (mode < 0) {
+ ret = mode;
goto out;
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9270a4370d54..03822f86f288 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- if (mem_cgroup_low(root, memcg)) {
+ switch (mem_cgroup_protected(root, memcg)) {
+ case MEMCG_PROT_MIN:
+ /*
+ * Hard protection.
+ * If there is no reclaimable memory, OOM.
+ */
+ continue;
+ case MEMCG_PROT_LOW:
+ /*
+ * Soft protection.
+ * Respect the protection only as long as
+ * there is an unprotected supply
+ * of reclaimable memory from other cgroups.
+ */
if (!sc->memcg_low_reclaim) {
sc->memcg_low_skipped = 1;
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
+ break;
+ case MEMCG_PROT_NONE:
+ break;
}
reclaimed = sc->nr_reclaimed;
@@ -3318,11 +3334,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
.may_unmap = 1,
.may_swap = 1,
};
+
+ __fs_reclaim_acquire();
+
count_vm_event(PAGEOUTRUN);
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3395,7 +3415,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
- if (try_to_freeze() || kthread_should_stop())
+ __fs_reclaim_release();
+ ret = try_to_freeze();
+ __fs_reclaim_acquire();
+ if (ret || kthread_should_stop())
break;
/*
@@ -3412,6 +3435,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
out:
snapshot_refaults(NULL, pgdat);
+ __fs_reclaim_release();
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3600,9 +3624,7 @@ kswapd_try_sleep:
*/
trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
alloc_order);
- fs_reclaim_acquire(GFP_KERNEL);
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
- fs_reclaim_release(GFP_KERNEL);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
@@ -3684,16 +3706,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
- noreclaim_flag = memalloc_noreclaim_save();
fs_reclaim_acquire(sc.gfp_mask);
+ noreclaim_flag = memalloc_noreclaim_save();
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
p->reclaim_state = NULL;
- fs_reclaim_release(sc.gfp_mask);
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
return nr_reclaimed;
}
@@ -3870,6 +3892,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
};
cond_resched();
+ fs_reclaim_acquire(sc.gfp_mask);
/*
* We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
@@ -3877,7 +3900,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- fs_reclaim_acquire(sc.gfp_mask);
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -3892,9 +3914,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
}
p->reclaim_state = NULL;
- fs_reclaim_release(gfp_mask);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
+ fs_reclaim_release(sc.gfp_mask);
return sc.nr_reclaimed >= nr_pages;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 61cb05dc950c..8d87e973a4f5 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -661,8 +661,9 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
}
pool->stat_dentry = entry;
- entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
- pool->stat_dentry, pool, &zs_stats_size_fops);
+ entry = debugfs_create_file("classes", S_IFREG | 0444,
+ pool->stat_dentry, pool,
+ &zs_stats_size_fops);
if (!entry) {
pr_warn("%s: debugfs file entry <%s> creation failed\n",
name, "classes");
diff --git a/mm/zswap.c b/mm/zswap.c
index 61a5c41972db..7d34e69507e3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1256,26 +1256,26 @@ static int __init zswap_debugfs_init(void)
if (!zswap_debugfs_root)
return -ENOMEM;
- debugfs_create_u64("pool_limit_hit", S_IRUGO,
- zswap_debugfs_root, &zswap_pool_limit_hit);
- debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
- zswap_debugfs_root, &zswap_reject_reclaim_fail);
- debugfs_create_u64("reject_alloc_fail", S_IRUGO,
- zswap_debugfs_root, &zswap_reject_alloc_fail);
- debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
- zswap_debugfs_root, &zswap_reject_kmemcache_fail);
- debugfs_create_u64("reject_compress_poor", S_IRUGO,
- zswap_debugfs_root, &zswap_reject_compress_poor);
- debugfs_create_u64("written_back_pages", S_IRUGO,
- zswap_debugfs_root, &zswap_written_back_pages);
- debugfs_create_u64("duplicate_entry", S_IRUGO,
- zswap_debugfs_root, &zswap_duplicate_entry);
- debugfs_create_u64("pool_total_size", S_IRUGO,
- zswap_debugfs_root, &zswap_pool_total_size);
- debugfs_create_atomic_t("stored_pages", S_IRUGO,
- zswap_debugfs_root, &zswap_stored_pages);
+ debugfs_create_u64("pool_limit_hit", 0444,
+ zswap_debugfs_root, &zswap_pool_limit_hit);
+ debugfs_create_u64("reject_reclaim_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_reclaim_fail);
+ debugfs_create_u64("reject_alloc_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_alloc_fail);
+ debugfs_create_u64("reject_kmemcache_fail", 0444,
+ zswap_debugfs_root, &zswap_reject_kmemcache_fail);
+ debugfs_create_u64("reject_compress_poor", 0444,
+ zswap_debugfs_root, &zswap_reject_compress_poor);
+ debugfs_create_u64("written_back_pages", 0444,
+ zswap_debugfs_root, &zswap_written_back_pages);
+ debugfs_create_u64("duplicate_entry", 0444,
+ zswap_debugfs_root, &zswap_duplicate_entry);
+ debugfs_create_u64("pool_total_size", 0444,
+ zswap_debugfs_root, &zswap_pool_total_size);
+ debugfs_create_atomic_t("stored_pages", 0444,
+ zswap_debugfs_root, &zswap_stored_pages);
debugfs_create_atomic_t("same_filled_pages", 0444,
- zswap_debugfs_root, &zswap_same_filled_pages);
+ zswap_debugfs_root, &zswap_same_filled_pages);
return 0;
}