aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2015-04-20 11:32:26 +1000
committerDave Airlie <airlied@redhat.com>2015-04-20 13:05:20 +1000
commit2c33ce009ca2389dbf0535d0672214d09738e35e (patch)
tree6186a6458c3c160385d794a23eaf07c786a9e61b /mm
parentmedia-bus: Fixup RGB444_1X12, RGB565_1X16, and YUV8_1X24 media bus format (diff)
parentMerge branch 'turbostat' of git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux (diff)
downloadlinux-dev-2c33ce009ca2389dbf0535d0672214d09738e35e.tar.xz
linux-dev-2c33ce009ca2389dbf0535d0672214d09738e35e.zip
Merge Linus master into drm-next
The merge is clean, but the arm build fails afterwards, due to API changes in the regulator tree. I've included the patch into the merge to fix the build. Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig6
-rw-r--r--mm/Makefile2
-rw-r--r--mm/cleancache.c276
-rw-r--r--mm/cma.c52
-rw-r--r--mm/cma.h24
-rw-r--r--mm/cma_debug.c205
-rw-r--r--mm/compaction.c75
-rw-r--r--mm/filemap.c130
-rw-r--r--mm/gup.c128
-rw-r--r--mm/huge_memory.c111
-rw-r--r--mm/hugetlb.c246
-rw-r--r--mm/internal.h8
-rw-r--r--mm/kasan/kasan.c13
-rw-r--r--mm/ksm.c10
-rw-r--r--mm/memblock.c22
-rw-r--r--mm/memcontrol.c241
-rw-r--r--mm/memory-failure.c122
-rw-r--r--mm/memory.c419
-rw-r--r--mm/memory_hotplug.c37
-rw-r--r--mm/mempolicy.c6
-rw-r--r--mm/mempool.c127
-rw-r--r--mm/memtest.c118
-rw-r--r--mm/migrate.c40
-rw-r--r--mm/mlock.c131
-rw-r--r--mm/mmap.c25
-rw-r--r--mm/mremap.c35
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/oom_kill.c9
-rw-r--r--mm/page-writeback.c22
-rw-r--r--mm/page_alloc.c253
-rw-r--r--mm/page_io.c7
-rw-r--r--mm/percpu.c4
-rw-r--r--mm/process_vm_access.c35
-rw-r--r--mm/rmap.c6
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c22
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c32
-rw-r--r--mm/swap.c34
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c39
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmalloc.c103
-rw-r--r--mm/zsmalloc.c971
45 files changed, 2702 insertions, 1500 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
+config CMA_DEBUGFS
+ bool "CMA debugfs interface"
+ depends on CMA && DEBUG_FS
+ help
+ Turns on the DebugFS interface for CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
diff --git a/mm/Makefile b/mm/Makefile
index 15dbe9903c27..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
#include <linux/cleancache.h>
/*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
static u64 cleancache_puts;
static u64 cleancache_invalidates;
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- * a) mount / -> __cleancache_init_fs is called. We set the
- * [shared_|]fs_poolid_map and uuids for.
- *
- * b). user does I/Os -> we call the rest of __cleancache_* functions
- * which return immediately as cleancache_ops is false.
- *
- * c). modprobe zcache -> cleancache_register_ops. We init the backend
- * and set cleancache_ops to true, and for any fs_poolid_map
- * (which is set by __cleancache_init_fs) we initialize the poolid.
- *
- * d). user does I/Os -> now that cleancache_ops is true all the
- * __cleancache_* functions can call the backend. They all check
- * that fs_poolid_map is valid and if so invoke the backend.
- *
- * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
- * reset (which is the second check in the __cleancache_* ops
- * to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+ switch (sb->cleancache_poolid) {
+ case CLEANCACHE_NO_BACKEND:
+ __cleancache_init_fs(sb);
+ break;
+ case CLEANCACHE_NO_BACKEND_SHARED:
+ __cleancache_init_shared_fs(sb);
+ break;
+ }
+}
/*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
*/
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
{
- struct cleancache_ops *old = cleancache_ops;
- int i;
+ if (cmpxchg(&cleancache_ops, NULL, ops))
+ return -EBUSY;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_NO_BACKEND)
- fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
- if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
- shared_fs_poolid_map[i] = ops->init_shared_fs
- (uuids[i], PAGE_SIZE);
- }
/*
- * We MUST set cleancache_ops _after_ we have called the backends
- * init_fs or init_shared_fs functions. Otherwise the compiler might
- * re-order where cleancache_ops is set in this function.
+ * A cleancache backend can be built as a module and hence loaded after
+ * a cleancache enabled filesystem has called cleancache_init_fs. To
+ * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+ * for each active super block. To differentiate between local and
+ * shared filesystems, we temporarily initialize sb->cleancache_poolid
+ * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+ * respectively in case there is no backend registered at the time
+ * cleancache_init_fs or cleancache_init_shared_fs is called.
+ *
+ * Since filesystems can be mounted concurrently with cleancache
+ * backend registration, we have to be careful to guarantee that all
+ * cleancache enabled filesystems that has been mounted by the time
+ * cleancache_register_ops is called has got and all mounted later will
+ * get cleancache_poolid. This is assured by the following statements
+ * tied together:
+ *
+ * a) iterate_supers skips only those super blocks that has started
+ * ->kill_sb
+ *
+ * b) if iterate_supers encounters a super block that has not finished
+ * ->mount yet, it waits until it is finished
+ *
+ * c) cleancache_init_fs is called from ->mount and
+ * cleancache_invalidate_fs is called from ->kill_sb
+ *
+ * d) we call iterate_supers after cleancache_ops has been set
+ *
+ * From a) it follows that if iterate_supers skips a super block, then
+ * either the super block is already dead, in which case we do not need
+ * to bother initializing cleancache for it, or it was mounted after we
+ * initiated iterate_supers. In the latter case, it must have seen
+ * cleancache_ops set according to d) and initialized cleancache from
+ * ->mount by itself according to c). This proves that we call
+ * ->init_fs at least once for each active super block.
+ *
+ * From b) and c) it follows that if iterate_supers encounters a super
+ * block that has already started ->init_fs, it will wait until ->mount
+ * and hence ->init_fs has finished, then check cleancache_poolid, see
+ * that it has already been set and therefore do nothing. This proves
+ * that we call ->init_fs no more than once for each super block.
+ *
+ * Combined together, the last two paragraphs prove the function
+ * correctness.
+ *
+ * Note that various cleancache callbacks may proceed before this
+ * function is called or even concurrently with it, but since
+ * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+ * until the corresponding ->init_fs has been actually called and
+ * cleancache_ops has been set.
*/
- barrier();
- cleancache_ops = ops;
- mutex_unlock(&poolid_mutex);
- return old;
+ iterate_supers(cleancache_register_ops_sb, NULL);
+ return 0;
}
EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
- if (cleancache_ops)
- fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
- else
- fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
- uuids[i] = uuid;
- if (cleancache_ops)
- shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
- (uuid, PAGE_SIZE);
- else
- shared_fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
}
/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
- return shared_fs_poolid_map[fake_pool_id -
- FAKE_SHARED_FS_POOLID_OFFSET];
- else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
- return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
- return FS_NO_BACKEND;
-}
-
-/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
* successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
goto out;
- pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
- if (pool_id >= 0)
- ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ ret = cleancache_ops->get_page(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
void __cleancache_put_page(struct page *page)
{
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
cleancache_get_key(page->mapping->host, &key) >= 0) {
cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id >= 0) {
- pool_id = get_poolid_from_fake(fake_pool_id);
- if (pool_id < 0)
- return;
-
+ if (pool_id >= 0) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
cleancache_ops->invalidate_inode(pool_id, key);
}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
- int index;
- int fake_pool_id = sb->cleancache_poolid;
- int old_poolid = fake_pool_id;
+ int pool_id;
- mutex_lock(&poolid_mutex);
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
- old_poolid = shared_fs_poolid_map[index];
- shared_fs_poolid_map[index] = FS_UNKNOWN;
- uuids[index] = NULL;
- } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
- old_poolid = fs_poolid_map[index];
- fs_poolid_map[index] = FS_UNKNOWN;
- }
- sb->cleancache_poolid = -1;
- if (cleancache_ops)
- cleancache_ops->invalidate_fs(old_poolid);
- mutex_unlock(&poolid_mutex);
+ pool_id = sb->cleancache_poolid;
+ sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+ if (cleancache_ops && pool_id >= 0)
+ cleancache_ops->invalidate_fs(pool_id);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
- int i;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- fs_poolid_map[i] = FS_UNKNOWN;
- shared_fs_poolid_map[i] = FS_UNKNOWN;
- }
return 0;
}
module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
index 68ecb7a42983..3a7a67b93394 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
# define DEBUG
#endif
#endif
+#define CREATE_TRACE_POINTS
#include <linux/memblock.h>
#include <linux/err.h>
@@ -34,30 +35,26 @@
#include <linux/cma.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <trace/events/cma.h>
-struct cma {
- unsigned long base_pfn;
- unsigned long count;
- unsigned long *bitmap;
- unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
-};
+#include "cma.h"
-static struct cma cma_areas[MAX_CMA_AREAS];
-static unsigned cma_area_count;
+struct cma cma_areas[MAX_CMA_AREAS];
+unsigned cma_area_count;
static DEFINE_MUTEX(cma_mutex);
-phys_addr_t cma_get_base(struct cma *cma)
+phys_addr_t cma_get_base(const struct cma *cma)
{
return PFN_PHYS(cma->base_pfn);
}
-unsigned long cma_get_size(struct cma *cma)
+unsigned long cma_get_size(const struct cma *cma)
{
return cma->count << PAGE_SHIFT;
}
-static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -68,7 +65,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
* Find a PFN aligned to the specified order and return an offset represented in
* order_per_bits.
*/
-static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -77,18 +75,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
- cma->base_pfn) >> cma->order_per_bit;
}
-static unsigned long cma_bitmap_maxno(struct cma *cma)
-{
- return cma->count >> cma->order_per_bit;
-}
-
-static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
- unsigned long pages)
+static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
+ unsigned long pages)
{
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
+ unsigned int count)
{
unsigned long bitmap_no, bitmap_count;
@@ -134,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma)
} while (--i);
mutex_init(&cma->lock);
+
+#ifdef CONFIG_CMA_DEBUGFS
+ INIT_HLIST_HEAD(&cma->mem_head);
+ spin_lock_init(&cma->mem_head_lock);
+#endif
+
return 0;
err:
@@ -167,7 +167,8 @@ core_initcall(cma_init_reserved_areas);
* This function creates custom contiguous area from already reserved memory.
*/
int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
- int order_per_bit, struct cma **res_cma)
+ unsigned int order_per_bit,
+ struct cma **res_cma)
{
struct cma *cma;
phys_addr_t alignment;
@@ -358,7 +359,7 @@ err:
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
{
unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -415,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
start = bitmap_no + mask + 1;
}
+ trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+
pr_debug("%s(): returned %p\n", __func__, page);
return page;
}
@@ -429,7 +432,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, struct page *pages, int count)
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
{
unsigned long pfn;
@@ -447,6 +450,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count)
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
+ trace_cma_release(pfn, pages, count);
return true;
}
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+ unsigned int order_per_bit; /* Order of pages represented by one bit */
+ struct mutex lock;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct hlist_head mem_head;
+ spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+ return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..7621ee34daa0
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,205 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+ struct hlist_node node;
+ struct page *p;
+ unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+ unsigned long *p = data;
+
+ *val = *p;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static int cma_used_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long used;
+
+ mutex_lock(&cma->lock);
+ /* pages counter is smaller than sizeof(int) */
+ used = bitmap_weight(cma->bitmap, (int)cma->count);
+ mutex_unlock(&cma->lock);
+ *val = (u64)used << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long maxchunk = 0;
+ unsigned long start, end = 0;
+
+ mutex_lock(&cma->lock);
+ for (;;) {
+ start = find_next_zero_bit(cma->bitmap, cma->count, end);
+ if (start >= cma->count)
+ break;
+ end = find_next_bit(cma->bitmap, cma->count, start);
+ maxchunk = max(end - start, maxchunk);
+ }
+ mutex_unlock(&cma->lock);
+ *val = (u64)maxchunk << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+ spin_lock(&cma->mem_head_lock);
+ hlist_add_head(&mem->node, &cma->mem_head);
+ spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+ struct cma_mem *mem = NULL;
+
+ spin_lock(&cma->mem_head_lock);
+ if (!hlist_empty(&cma->mem_head)) {
+ mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+ hlist_del_init(&mem->node);
+ }
+ spin_unlock(&cma->mem_head_lock);
+
+ return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem = NULL;
+
+ while (count) {
+ mem = cma_get_entry_from_list(cma);
+ if (mem == NULL)
+ return 0;
+
+ if (mem->n <= count) {
+ cma_release(cma, mem->p, mem->n);
+ count -= mem->n;
+ kfree(mem);
+ } else if (cma->order_per_bit == 0) {
+ cma_release(cma, mem->p, count);
+ mem->p += count;
+ mem->n -= count;
+ count = 0;
+ cma_add_to_cma_mem_list(cma, mem);
+ } else {
+ pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+ cma_add_to_cma_mem_list(cma, mem);
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_free_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem;
+ struct page *p;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ p = cma_alloc(cma, count, 0);
+ if (!p) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->p = p;
+ mem->n = count;
+
+ cma_add_to_cma_mem_list(cma, mem);
+
+ return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_alloc_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+ struct dentry *tmp;
+ char name[16];
+ int u32s;
+
+ sprintf(name, "cma-%d", idx);
+
+ tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+ debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+ &cma_alloc_fops);
+
+ debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+ &cma_free_fops);
+
+ debugfs_create_file("base_pfn", S_IRUGO, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", S_IRUGO, tmp,
+ &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+ debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+ debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+
+ u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+ int i;
+
+ cma_debugfs_root = debugfs_create_dir("cma", NULL);
+ if (!cma_debugfs_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_debugfs_add_one(&cma_areas[i], i);
+
+ return 0;
+}
+late_initcall(cma_debugfs_init);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
return false;
}
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
- /* If the page is a large free page, then disallow migration */
- if (PageBuddy(page)) {
- /*
- * We are checking page_order without zone->lock taken. But
- * the only small danger is that we skip a potentially suitable
- * pageblock, so it's not worth to check order for valid range.
- */
- if (page_order_unsafe(page) >= pageblock_order)
- return false;
- }
-
- /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
- return true;
-
- /* Otherwise skip the block */
- return false;
-}
-
/*
* Isolate free pages onto a private freelist. If @strict is true, will abort
* returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ /* If the page is a large free page, then disallow migration */
+ if (PageBuddy(page)) {
+ /*
+ * We are checking page_order without zone->lock taken. But
+ * the only small danger is that we skip a potentially suitable
+ * pageblock, so it's not worth to check order for valid range.
+ */
+ if (page_order_unsafe(page) >= pageblock_order)
+ return false;
+ }
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
} isolate_migrate_t;
/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+
+/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long low_pfn, end_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
+ (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/*
@@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
+ bool can_steal;
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && area->nr_free)
+#ifdef CONFIG_CMA
+ /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+ if (migratetype == MIGRATE_MOVABLE &&
+ !list_empty(&area->free_list[MIGRATE_CMA]))
+ return COMPACT_PARTIAL;
+#endif
+ /*
+ * Job done if allocation would steal freepages from
+ * other migratetype buddy lists.
+ */
+ if (find_suitable_fallback(area, order, migratetype,
+ true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
@@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
+ /*
+ * When called via /proc/sys/vm/compact_memory
+ * this makes sure we compact the whole zone regardless of
+ * cached scanner positions.
+ */
+ if (cc->order == -1)
+ __reset_isolation_suitable(zone);
+
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index ad7242043bdb..6bf5e42d560a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,7 +13,6 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
@@ -203,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
BUG_ON(page_mapped(page));
/*
- * Some filesystems seem to re-dirty the page even after
- * the VM has canceled the dirty bit (eg ext3 journaling).
+ * At this point page must be either written or cleaned by truncate.
+ * Dirty page here signals a bug and loss of unwritten data.
*
- * Fix it up by doing a final dirty accounting check after
- * having removed the page entirely.
+ * This fixes dirty accounting after removing the page entirely but
+ * leaves PageDirty set: it has no effect for truncated page and
+ * anyway will be cleared before returning page into buddy allocator.
*/
- if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
- }
+ if (WARN_ON_ONCE(PageDirty(page)))
+ account_page_cleaned(page, mapping);
}
/**
@@ -1695,7 +1693,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
- if (io_is_direct(file)) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
@@ -1708,7 +1706,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
pos + count - 1);
if (!retval) {
struct iov_iter data = *iter;
- retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
+ retval = mapping->a_ops->direct_IO(iocb, &data, pos);
}
if (retval > 0) {
@@ -2261,41 +2259,38 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* Returns appropriate error code that caller should return or
* zero in case that write should be allowed.
*/
-inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
unsigned long limit = rlimit(RLIMIT_FSIZE);
+ loff_t pos;
- if (unlikely(*pos < 0))
- return -EINVAL;
+ if (!iov_iter_count(from))
+ return 0;
- if (!isblk) {
- /* FIXME: this is for backwards compatibility with 2.4 */
- if (file->f_flags & O_APPEND)
- *pos = i_size_read(inode);
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (iocb->ki_flags & IOCB_APPEND)
+ iocb->ki_pos = i_size_read(inode);
- if (limit != RLIM_INFINITY) {
- if (*pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- if (*count > limit - (typeof(limit))*pos) {
- *count = limit - (typeof(limit))*pos;
- }
+ pos = iocb->ki_pos;
+
+ if (limit != RLIM_INFINITY) {
+ if (iocb->ki_pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
}
+ iov_iter_truncate(from, limit - (unsigned long)pos);
}
/*
* LFS rule
*/
- if (unlikely(*pos + *count > MAX_NON_LFS &&
+ if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
!(file->f_flags & O_LARGEFILE))) {
- if (*pos >= MAX_NON_LFS) {
+ if (pos >= MAX_NON_LFS)
return -EFBIG;
- }
- if (*count > MAX_NON_LFS - (unsigned long)*pos) {
- *count = MAX_NON_LFS - (unsigned long)*pos;
- }
+ iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
}
/*
@@ -2305,34 +2300,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
* exceeded without writing data we send a signal and return EFBIG.
* Linus frestrict idea will clean these up nicely..
*/
- if (likely(!isblk)) {
- if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
- if (*count || *pos > inode->i_sb->s_maxbytes) {
- return -EFBIG;
- }
- /* zero-length writes at ->s_maxbytes are OK */
- }
-
- if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
- *count = inode->i_sb->s_maxbytes - *pos;
- } else {
-#ifdef CONFIG_BLOCK
- loff_t isize;
- if (bdev_read_only(I_BDEV(inode)))
- return -EPERM;
- isize = i_size_read(inode);
- if (*pos >= isize) {
- if (*count || *pos > isize)
- return -ENOSPC;
- }
+ if (unlikely(pos >= inode->i_sb->s_maxbytes))
+ return -EFBIG;
- if (*pos + *count > isize)
- *count = isize - *pos;
-#else
- return -EPERM;
-#endif
- }
- return 0;
+ iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+ return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
@@ -2396,7 +2368,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
}
data = *from;
- written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+ written = mapping->a_ops->direct_IO(iocb, &data, pos);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2558,23 +2530,12 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
ssize_t written = 0;
ssize_t err;
ssize_t status;
- size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
-
- if (count == 0)
- goto out;
-
- iov_iter_truncate(from, count);
-
err = file_remove_suid(file);
if (err)
goto out;
@@ -2583,10 +2544,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- if (io_is_direct(file)) {
- loff_t endbyte;
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos, endbyte;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, from, iocb->ki_pos);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -2594,13 +2555,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || written == count || IS_DAX(inode))
+ if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
goto out;
- pos += written;
- count -= written;
-
- status = generic_perform_write(file, from, pos);
+ status = generic_perform_write(file, from, pos = iocb->ki_pos);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
@@ -2612,15 +2570,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
err = status;
goto out;
}
- iocb->ki_pos = pos + status;
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
+ iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
@@ -2632,9 +2590,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
*/
}
} else {
- written = generic_perform_write(file, from, pos);
- if (likely(written >= 0))
- iocb->ki_pos = pos + written;
+ written = generic_perform_write(file, from, iocb->ki_pos);
+ if (likely(written > 0))
+ iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
@@ -2658,7 +2616,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_iter(iocb, from);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
- /* For mlock, just skip the stack guard page. */
- if ((*flags & FOLL_MLOCK) &&
+ /* For mm_populate(), just skip the stack guard page. */
+ if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
stack_guard_page_end(vma, address + PAGE_SIZE)))
return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
EXPORT_SYMBOL(get_user_pages);
/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking:
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must held for read only and may be
+ * released. If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
@@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
*
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
- pte_t pte = ACCESS_ONCE(*ptep);
+ pte_t pte = READ_ONCE(*ptep);
struct page *page;
/*
@@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
- pgd_t pgd = ACCESS_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*pgdp);
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!khugepaged_enabled())
- return 0;
-
for_each_populated_zone(zone)
nr_zones++;
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
setup_per_zone_wmarks();
return 0;
}
-late_initcall(set_recommended_min_free_kbytes);
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
{
int err = 0;
if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
pr_err("khugepaged: kthread_run(khugepaged) failed\n");
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
+ goto fail;
}
if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
-
+fail:
return err;
}
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
}
static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
int err;
mutex_lock(&khugepaged_mutex);
- err = start_khugepaged();
+ err = start_stop_khugepaged();
mutex_unlock(&khugepaged_mutex);
if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
- return err;
+ goto err_sysfs;
err = khugepaged_slab_init();
if (err)
- goto out;
+ goto err_slab;
- register_shrinker(&huge_zero_page_shrinker);
+ err = register_shrinker(&huge_zero_page_shrinker);
+ if (err)
+ goto err_hzp_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0;
+ return 0;
+ }
- start_khugepaged();
+ err = start_stop_khugepaged();
+ if (err)
+ goto err_khugepaged;
return 0;
-out:
+err_khugepaged:
+ unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+ khugepaged_slab_exit();
+err_slab:
hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
return err;
}
subsys_initcall(hugepage_init);
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
- struct page *page)
+ struct page *page, gfp_t gfp)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
return VM_FAULT_OOM;
pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t huge_gfp; /* for allocation and charge */
ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- gfp_t gfp;
-
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
- new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -1130,8 +1138,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -1976,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
return 0;
}
+static void __init khugepaged_slab_exit(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+}
+
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
@@ -2109,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval))
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
release_pte_page(pte_page(pteval));
}
}
@@ -2120,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int none = 0;
+ int none_or_zero = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2207,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
pte_t pteval = *_pte;
struct page *src_page;
- if (pte_none(pteval)) {
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
@@ -2311,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
@@ -2326,8 +2350,7 @@ static struct page
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2380,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
+
return *hpage;
}
#endif
@@ -2421,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+ __GFP_THISNODE;
+
/* release the mmap_sem read lock. */
- new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
if (!new_page)
return;
if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg)))
+ gfp, &memcg)))
return;
/*
@@ -2543,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none = 0;
+ int ret = 0, none_or_zero = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
@@ -2561,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c41b2a0ee273..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, free the subpool the subpool remain */
- if (free)
+ * remain, give up any reservations mased on minimum size and
+ * free the subpool */
+ if (free) {
+ if (spool->min_hpages != -1)
+ hugetlb_acct_memory(spool->hstate,
+ -spool->min_hpages);
kfree(spool);
+ }
}
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages)
{
struct hugepage_subpool *spool;
- spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
- spool->max_hpages = nr_blocks;
- spool->used_hpages = 0;
+ spool->max_hpages = max_hpages;
+ spool->hstate = h;
+ spool->min_hpages = min_hpages;
+
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ kfree(spool);
+ return NULL;
+ }
+ spool->rsv_hpages = min_hpages;
return spool;
}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
unlock_or_release_subpool(spool);
}
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * the request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be manitained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
- int ret = 0;
+ long ret = delta;
if (!spool)
- return 0;
+ return ret;
spin_lock(&spool->lock);
- if ((spool->used_hpages + delta) <= spool->max_hpages) {
- spool->used_hpages += delta;
- } else {
- ret = -ENOMEM;
+
+ if (spool->max_hpages != -1) { /* maximum size accounting */
+ if ((spool->used_hpages + delta) <= spool->max_hpages)
+ spool->used_hpages += delta;
+ else {
+ ret = -ENOMEM;
+ goto unlock_ret;
+ }
}
- spin_unlock(&spool->lock);
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (delta > spool->rsv_hpages) {
+ /*
+ * Asking for more reserves than those already taken on
+ * behalf of subpool. Return difference.
+ */
+ ret = delta - spool->rsv_hpages;
+ spool->rsv_hpages = 0;
+ } else {
+ ret = 0; /* reserves already accounted for */
+ spool->rsv_hpages -= delta;
+ }
+ }
+
+unlock_ret:
+ spin_unlock(&spool->lock);
return ret;
}
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
+ long ret = delta;
+
if (!spool)
- return;
+ return delta;
spin_lock(&spool->lock);
- spool->used_hpages -= delta;
- /* If hugetlbfs_put_super couldn't free spool due to
- * an outstanding quota reference, free it now. */
+
+ if (spool->max_hpages != -1) /* maximum size accounting */
+ spool->used_hpages -= delta;
+
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (spool->rsv_hpages + delta <= spool->min_hpages)
+ ret = 0;
+ else
+ ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+ spool->rsv_hpages += delta;
+ if (spool->rsv_hpages > spool->min_hpages)
+ spool->rsv_hpages = spool->min_hpages;
+ }
+
+ /*
+ * If hugetlbfs_put_super couldn't free spool due to an outstanding
+ * quota reference, free it now.
+ */
unlock_or_release_subpool(spool);
+
+ return ret;
}
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
+ return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ ClearPagePrivate(&page[1]);
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ /*
+ * A return code of zero implies that the subpool will be under its
+ * minimum size if the reservation is not restored after page is free.
+ * Therefore, force restore_reserve operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+
spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (chg < 0)
return ERR_PTR(-ENOMEM);
if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1))
+ if (hugepage_subpool_get_pages(spool, 1) < 0)
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
+ long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
kref_put(&resv->refs, resv_map_release);
if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
+ /*
+ * Decrement reserve counts. The global reserve count may be
+ * adjusted if the subpool has a minimum size.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+ hugetlb_acct_memory(h, -gbl_reserve);
}
}
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
+ set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
int err;
@@ -3278,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
/*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current))) {
+ remainder = 0;
+ break;
+ }
+
+ /*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
@@ -3438,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
+ long gbl_reserve;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3474,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
- /* There must be enough pages in the subpool for the mapping */
- if (hugepage_subpool_get_pages(spool, chg)) {
+ /*
+ * There must be enough pages in the subpool for the mapping. If
+ * the subpool has a minimum size, there may be some global
+ * reservations already in place (gbl_reserve).
+ */
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
}
@@ -3484,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, chg);
+ ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- hugepage_subpool_put_pages(spool, chg);
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
goto out_err;
}
@@ -3516,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
+ long gbl_reserve;
if (resv_map)
chg = region_truncate(resv_map, offset);
@@ -3523,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -(chg - freed));
+ /*
+ * If the subpool has a minimum size, the number of global
+ * reservations to be released may be adjusted.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -gbl_reserve);
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3735,8 +3865,7 @@ retry:
if (!pmd_huge(*pmd))
goto out;
if (pmd_present(*pmd)) {
- page = pte_page(*(pte_t *)pmd) +
- ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
@@ -3767,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#ifdef CONFIG_MEMORY_FAILURE
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
- struct page *page;
- struct page *tmp;
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
-
- list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
- if (page == hpage)
- return 1;
- return 0;
-}
-
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
@@ -3792,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- if (is_hugepage_on_freelist(hpage)) {
+ /*
+ * Just checking !page_huge_active is not enough, because that could be
+ * an isolated/hwpoisoned hugepage (which have >0 refcount).
+ */
+ if (!page_huge_active(hpage) && !page_count(hpage)) {
/*
* Hwpoisoned hugepage isn't linked to activelist or freelist,
* but dangling hpage->lru can trigger list-debug warnings
@@ -3812,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
+ bool ret = true;
+
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (!get_page_unless_zero(page))
- return false;
spin_lock(&hugetlb_lock);
+ if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ ret = false;
+ goto unlock;
+ }
+ clear_page_huge_active(page);
list_move_tail(&page->lru, list);
+unlock:
spin_unlock(&hugetlb_lock);
- return true;
+ return ret;
}
void putback_active_hugepage(struct page *page)
{
VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
+ set_page_huge_active(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
}
-
-bool is_hugepage_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- /*
- * This function can be called for a tail page because the caller,
- * scan_movable_pages, scans through a given pfn-range which typically
- * covers one memory block. In systems using gigantic hugepage (1GB
- * for x86_64,) a hugepage is larger than a memory block, and we don't
- * support migrating such large hugepages for now, so return false
- * when called for tail pages.
- */
- if (PageTail(page))
- return false;
- /*
- * Refcount of a hwpoisoned hugepages is 1, but they are not active,
- * so we should return false for them.
- */
- if (unlikely(PageHWPoison(page)))
- return false;
- return page_count(page) > 0;
-}
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
#endif
@@ -222,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
- * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * READ_ONCE is used so that if the caller assigns the result into a local
* variable and e.g. tests it for valid range before using, the compiler cannot
* decide to remove the variable and inline the page_private(page) multiple
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
+#define page_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
#ifdef CONFIG_MMU
-extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
kasan_kmalloc(page->slab_cache, object, size);
}
+void kasan_kfree(void *ptr)
+{
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+
+ if (unlikely(!PageSlab(page)))
+ kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+ KASAN_FREE_PAGE);
+ else
+ kasan_slab_free(page->slab_cache, ptr);
+}
+
void kasan_kfree_large(const void *ptr)
{
struct page *page = virt_to_page(ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
- kpfn = ACCESS_ONCE(stable_node->kpfn);
+ kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
/*
@@ -551,7 +551,7 @@ again:
* but on Alpha we need to be more careful.
*/
smp_read_barrier_depends();
- if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ if (READ_ONCE(page->mapping) != expected_mapping)
goto stale;
/*
@@ -577,14 +577,14 @@ again:
cpu_relax();
}
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
put_page(page);
goto stale;
}
if (lock_it) {
lock_page(page);
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
goto stale;
@@ -600,7 +600,7 @@ stale:
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
- if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ if (READ_ONCE(stable_node->kpfn) != kpfn)
goto again;
remove_node_from_stable_tree(stable_node);
return NULL;
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
+static int __init_memblock memblock_add_region(phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ unsigned long flags)
+{
+ struct memblock_type *_rgn = &memblock.memory;
+
+ memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
+ (unsigned long long)base,
+ (unsigned long long)base + size - 1,
+ flags, (void *)_RET_IP_);
+
+ return memblock_add_range(_rgn, base, size, nid, flags);
+}
+
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_range(&memblock.memory, base, size,
- MAX_NUMNODES, 0);
+ return memblock_add_region(base, size, MAX_NUMNODES, 0);
}
/**
@@ -699,14 +713,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -253,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
- *
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
@@ -454,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
return memcg->css.id;
}
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
@@ -667,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
- unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
@@ -1035,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
do {
- pos = ACCESS_ONCE(iter->position);
+ pos = READ_ONCE(iter->position);
/*
* A racing update may change the position and
* put the last reference, hence css_tryget(),
@@ -1352,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = ACCESS_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.limit);
if (count < limit)
margin = limit - count;
if (do_swap_account) {
count = page_counter_read(&memcg->memsw);
- limit = ACCESS_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
margin = min(margin, limit - count);
}
@@ -1436,15 +1443,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
struct mem_cgroup *iter;
unsigned int i;
- if (!p)
- return;
-
mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
+ if (p) {
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ } else {
+ pr_info("Memory limit reached of cgroup ");
+ }
+
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont("\n");
@@ -1531,7 +1540,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
return;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -2341,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
}
/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- return mem_cgroup_from_id(id);
-}
-
-/*
* try_get_mem_cgroup_from_page - look up page's memcg association
* @page: the page
*
@@ -2380,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
@@ -2642,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2779,92 +2774,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
- *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- unsigned long flags;
- int ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
-
- /*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
- */
- if (!trylock_page(page))
- goto out;
-
- ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
-
- spin_lock_irqsave(&from->move_lock, flags);
-
- if (!PageAnon(page) && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- }
-
- /*
- * It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
- */
-
- /* caller should have done css_get */
- page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
-
- ret = 0;
-
- local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
- local_irq_enable();
-out_unlock:
- unlock_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -4816,6 +4725,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ spin_lock_irqsave(&from->move_lock, flags);
+
+ if (!PageAnon(page) && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ /*
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
+
+ /* caller should have done css_get */
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -5012,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5246,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = ACCESS_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5276,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = ACCESS_ONCE(memcg->high);
+ unsigned long high = READ_ONCE(memcg->high);
if (high == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5306,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = ACCESS_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.limit);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5861,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memsw, 1);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..d9359b770cd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
[RECOVERED] = "Recovered",
};
+enum action_page_type {
+ MSG_KERNEL,
+ MSG_KERNEL_HIGH_ORDER,
+ MSG_SLAB,
+ MSG_DIFFERENT_COMPOUND,
+ MSG_POISONED_HUGE,
+ MSG_HUGE,
+ MSG_FREE_HUGE,
+ MSG_UNMAP_FAILED,
+ MSG_DIRTY_SWAPCACHE,
+ MSG_CLEAN_SWAPCACHE,
+ MSG_DIRTY_MLOCKED_LRU,
+ MSG_CLEAN_MLOCKED_LRU,
+ MSG_DIRTY_UNEVICTABLE_LRU,
+ MSG_CLEAN_UNEVICTABLE_LRU,
+ MSG_DIRTY_LRU,
+ MSG_CLEAN_LRU,
+ MSG_TRUNCATED_LRU,
+ MSG_BUDDY,
+ MSG_BUDDY_2ND,
+ MSG_UNKNOWN,
+};
+
+static const char * const action_page_types[] = {
+ [MSG_KERNEL] = "reserved kernel page",
+ [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [MSG_SLAB] = "kernel slab page",
+ [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
+ [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
+ [MSG_HUGE] = "huge page",
+ [MSG_FREE_HUGE] = "free huge page",
+ [MSG_UNMAP_FAILED] = "unmapping failed page",
+ [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
+ [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [MSG_DIRTY_LRU] = "dirty LRU page",
+ [MSG_CLEAN_LRU] = "clean LRU page",
+ [MSG_TRUNCATED_LRU] = "already truncated LRU page",
+ [MSG_BUDDY] = "free buddy page",
+ [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MSG_UNKNOWN] = "unknown page",
+};
+
/*
* XXX: It is possible that a page is isolated from LRU cache,
* and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
static struct page_state {
unsigned long mask;
unsigned long res;
- char *msg;
+ enum action_page_type type;
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
- { reserved, reserved, "reserved kernel", me_kernel },
+ { reserved, reserved, MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
* PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
* currently unused objects without touching them. But just
* treat it as standard kernel for now.
*/
- { slab, slab, "kernel slab", me_kernel },
+ { slab, slab, MSG_SLAB, me_kernel },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
- { head, head, "huge", me_huge_page },
- { tail, tail, "huge", me_huge_page },
+ { head, head, MSG_HUGE, me_huge_page },
+ { tail, tail, MSG_HUGE, me_huge_page },
#else
- { compound, compound, "huge", me_huge_page },
+ { compound, compound, MSG_HUGE, me_huge_page },
#endif
- { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
- { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
- { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
- { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
- { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+ { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
/*
* Catchall entry: must be at end.
*/
- { 0, 0, "unknown page state", me_unknown },
+ { 0, 0, MSG_UNKNOWN, me_unknown },
};
#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, enum action_page_type type, int result)
{
- pr_err("MCE %#lx: %s page recovery: %s\n",
- pfn, msg, action_name[result]);
+ pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pfn, action_page_types[type], action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
count--;
if (count != 0) {
printk(KERN_ERR
- "MCE %#lx: %s page still referenced by %d users\n",
- pfn, ps->msg, count);
+ "MCE %#lx: %s still referenced by %d users\n",
+ pfn, action_page_types[ps->type], count);
result = FAILED;
}
- action_result(pfn, ps->msg, result);
+ action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
/*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!(flags & MF_COUNT_INCREASED) &&
!get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MSG_BUDDY, DELAYED);
return 0;
} else if (PageHuge(hpage)) {
/*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, "free huge",
+ action_result(pfn, MSG_FREE_HUGE,
res ? IGNORED : DELAYED);
unlock_page(hpage);
return res;
} else {
- action_result(pfn, "high order kernel", IGNORED);
+ action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
return -EBUSY;
}
}
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (is_free_buddy_page(p)) {
if (flags & MF_COUNT_INCREASED)
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MSG_BUDDY, DELAYED);
else
- action_result(pfn, "free buddy, 2nd try", DELAYED);
+ action_result(pfn, MSG_BUDDY_2ND,
+ DELAYED);
return 0;
}
}
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* If this happens just bail out.
*/
if (compound_head(p) != hpage) {
- action_result(pfn, "different compound page after locking", IGNORED);
+ action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
res = -EBUSY;
goto out;
}
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* on the head page to show that the hugepage is hwpoisoned
*/
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, "hugepage already hardware poisoned",
- IGNORED);
+ action_result(pfn, MSG_POISONED_HUGE, IGNORED);
unlock_page(hpage);
put_page(hpage);
return 0;
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
!= SWAP_SUCCESS) {
- action_result(pfn, "unmapping failed", IGNORED);
+ action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
res = -EBUSY;
goto out;
}
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, "already truncated LRU", IGNORED);
+ action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
res = -EBUSY;
goto out;
}
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
unlock_page(hpage);
- /* Keep page count to indicate a given hugepage is isolated. */
- list_move(&hpage->lru, &pagelist);
+ ret = isolate_huge_page(hpage, &pagelist);
+ if (ret) {
+ /*
+ * get_any_page() and isolate_huge_page() takes a refcount each,
+ * so need to drop one here.
+ */
+ put_page(hpage);
+ } else {
+ pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
+ return -EBUSY;
+ }
+
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
- if (vma->vm_ops)
- printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
- vma->vm_ops->fault);
- if (vma->vm_file)
- printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
- vma->vm_file->f_op->mmap);
+ pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->fault : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ mapping ? mapping->a_ops->readpage : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * Handle write page faults for pages that can be reused in the current vma
*
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
- *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ struct page *page, int page_mkwrite,
+ int dirty_shared)
__releases(ptl)
{
- struct page *old_page, *new_page = NULL;
pte_t entry;
- int ret = 0;
- int page_mkwrite = 0;
- bool dirty_shared = false;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
- struct mem_cgroup *memcg;
-
- old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page) {
- /*
- * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
- *
- * We should not cow pages in a shared writeable mapping.
- * Just mark the pages writable as we can't do any dirty
- * accounting on raw pfn maps.
- */
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- goto reuse;
- goto gotten;
- }
-
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
- if (!trylock_page(old_page)) {
- page_cache_get(old_page);
- pte_unmap_unlock(page_table, ptl);
- lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_cache_release(old_page);
- }
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
- unlock_page(old_page);
- goto reuse;
- }
- unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
-
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
- }
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_mkwrite = 1;
- }
-
- dirty_shared = true;
-
-reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
- ret |= VM_FAULT_WRITE;
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- if (!page_mkwrite)
- lock_page(old_page);
+ if (!page_mkwrite)
+ lock_page(page);
- dirtied = set_page_dirty(old_page);
- VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
- mapping = old_page->mapping;
- unlock_page(old_page);
- page_cache_release(old_page);
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- return ret;
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
- /*
- * Ok, we need to copy. Oh, well..
- */
- page_cache_get(old_page);
-gotten:
- pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte, struct page *old_page)
+{
+ struct page *new_page = NULL;
+ spinlock_t *ptl = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2163,8 +2086,6 @@ gotten:
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2177,8 +2098,9 @@ gotten:
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- } else
+ } else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2149,29 @@ gotten:
/* Free the old page.. */
new_page = old_page;
- ret |= VM_FAULT_WRITE;
- } else
+ page_copied = 1;
+ } else {
mem_cgroup_cancel_charge(new_page, memcg);
+ }
if (new_page)
page_cache_release(new_page);
-unlock:
+
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
- return ret;
+ return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
page_cache_release(new_page);
oom:
@@ -2258,6 +2180,179 @@ oom:
return VM_FAULT_OOM;
}
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ pmd_t *pmd)
+{
+ if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+ struct vm_fault vmf = {
+ .page = NULL,
+ .pgoff = linear_page_index(vma, address),
+ .virtual_address = (void __user *)(address & PAGE_MASK),
+ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+ };
+ int ret;
+
+ pte_unmap_unlock(page_table, ptl);
+ ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+ if (ret & VM_FAULT_ERROR)
+ return ret;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ /*
+ * We might have raced with another page fault while we
+ * released the pte_offset_map_lock.
+ */
+ if (!pte_same(*page_table, orig_pte)) {
+ pte_unmap_unlock(page_table, ptl);
+ return 0;
+ }
+ }
+ return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+ NULL, 0, 0);
+}
+
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table,
+ pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+ struct page *old_page)
+ __releases(ptl)
+{
+ int page_mkwrite = 0;
+
+ page_cache_get(old_page);
+
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ pte_unmap_unlock(page_table, ptl);
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_mkwrite = 1;
+ }
+
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
+{
+ struct page *old_page;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable and/or call ops->pfn_mkwrite.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_pfn_shared(mm, vma, address, page_table, ptl,
+ orig_pte, pmd);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+ }
+ unlock_page(old_page);
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(mm, vma, address, page_table, pmd,
+ ptl, orig_pte, old_page);
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+}
+
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
@@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
struct vm_fault vmf;
int off;
- nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+ nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 65842d688b7c..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
}
-static void mem_hotplug_begin(void)
+void mem_hotplug_begin(void)
{
mem_hotplug.active_writer = current;
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
}
}
-static void mem_hotplug_done(void)
+void mem_hotplug_done(void)
{
mem_hotplug.active_writer = NULL;
mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, section_nr_to_pfn(i));
/*
* EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
}
+/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
- ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL ||
online_type == MMOP_ONLINE_MOVABLE) &&
!can_online_high_movable(zone))
- goto out;
+ return -EINVAL;
if (online_type == MMOP_ONLINE_KERNEL &&
zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
if (online_type == MMOP_ONLINE_MOVABLE &&
zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
/* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
-out:
- mem_hotplug_done();
- return ret;
+ return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1376,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (PageLRU(page))
return pfn;
if (PageHuge(page)) {
- if (is_hugepage_active(page))
+ if (page_huge_active(page))
return pfn;
else
pfn = round_up(pfn + 1,
@@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- mem_hotplug_begin();
-
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- ret = -EINVAL;
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- goto out;
+ return -EINVAL;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
if (ret)
- goto out;
+ return ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1795,7 +1789,6 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- mem_hotplug_done();
return 0;
failed_removal:
@@ -1805,12 +1798,10 @@ failed_removal:
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-
-out:
- mem_hotplug_done();
return ret;
}
+/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE, 0);
}
/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(node, gfp, order);
+ page = alloc_pages_exact_node(node,
+ gfp | __GFP_THISNODE, order);
goto out;
}
}
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
* extreme VM load.
*
* started by Ingo Molnar, Copyright (C) 2001
+ * debugging by David Rientjes, Copyright (C) 2015
*/
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
+#include "slab.h"
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+static void poison_error(mempool_t *pool, void *element, size_t size,
+ size_t byte)
+{
+ const int nr = pool->curr_nr;
+ const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+ const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+ int i;
+
+ pr_err("BUG: mempool element poison mismatch\n");
+ pr_err("Mempool %p size %zu\n", pool, size);
+ pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+ for (i = start; i < end; i++)
+ pr_cont("%x ", *(u8 *)(element + i));
+ pr_cont("%s\n", end < size ? "..." : "");
+ dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+ u8 *obj = element;
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+ if (obj[i] != exp) {
+ poison_error(pool, element, size, i);
+ return;
+ }
+ }
+ memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+ __check_element(pool, element, ksize(element));
+
+ /* Mempools backed by page allocator */
+ if (pool->free == mempool_free_pages) {
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+
+static void __poison_element(void *element, size_t size)
+{
+ u8 *obj = element;
+
+ memset(obj, POISON_FREE, size - 1);
+ obj[size - 1] = POISON_END;
+}
+
+static void poison_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ __poison_element(element, ksize(element));
+
+ /* Mempools backed by page allocator */
+ if (pool->alloc == mempool_alloc_pages) {
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __poison_element(addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+static void kasan_poison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab)
+ kasan_slab_free(pool->pool_data, element);
+ if (pool->alloc == mempool_kmalloc)
+ kasan_kfree(element);
+ if (pool->alloc == mempool_alloc_pages)
+ kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab)
+ kasan_slab_alloc(pool->pool_data, element);
+ if (pool->alloc == mempool_kmalloc)
+ kasan_krealloc(element, (size_t)pool->pool_data);
+ if (pool->alloc == mempool_alloc_pages)
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
static void add_element(mempool_t *pool, void *element)
{
BUG_ON(pool->curr_nr >= pool->min_nr);
+ poison_element(pool, element);
+ kasan_poison_element(pool, element);
pool->elements[pool->curr_nr++] = element;
}
static void *remove_element(mempool_t *pool)
{
- BUG_ON(pool->curr_nr <= 0);
- return pool->elements[--pool->curr_nr];
+ void *element = pool->elements[--pool->curr_nr];
+
+ BUG_ON(pool->curr_nr < 0);
+ check_element(pool, element);
+ kasan_unpoison_element(pool, element);
+ return element;
}
/**
@@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node);
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
- * @gfp_mask: the usual allocation bitmask.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ might_sleep();
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
- new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
if (!new_elements)
return -ENOMEM;
@@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
- element = pool->alloc(gfp_mask, pool->pool_data);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
@@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
struct kmem_cache *mem = pool_data;
+ VM_BUG_ON(mem->ctor);
return kmem_cache_alloc(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..1997d934b13b
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,118 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/memblock.h>
+
+static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
+ 0,
+ 0xffffffffffffffffULL,
+ 0x5555555555555555ULL,
+ 0xaaaaaaaaaaaaaaaaULL,
+ 0x1111111111111111ULL,
+ 0x2222222222222222ULL,
+ 0x4444444444444444ULL,
+ 0x8888888888888888ULL,
+ 0x3333333333333333ULL,
+ 0x6666666666666666ULL,
+ 0x9999999999999999ULL,
+ 0xccccccccccccccccULL,
+ 0x7777777777777777ULL,
+ 0xbbbbbbbbbbbbbbbbULL,
+ 0xddddddddddddddddULL,
+ 0xeeeeeeeeeeeeeeeeULL,
+ 0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
+{
+ printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
+ (unsigned long long) pattern,
+ (unsigned long long) start_bad,
+ (unsigned long long) end_bad);
+ memblock_reserve(start_bad, end_bad - start_bad);
+}
+
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
+{
+ u64 *p, *start, *end;
+ phys_addr_t start_bad, last_bad;
+ phys_addr_t start_phys_aligned;
+ const size_t incr = sizeof(pattern);
+
+ start_phys_aligned = ALIGN(start_phys, incr);
+ start = __va(start_phys_aligned);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
+ start_bad = 0;
+ last_bad = 0;
+
+ for (p = start; p < end; p++)
+ *p = pattern;
+
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
+ continue;
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ continue;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+ start_bad = last_bad = start_phys_aligned;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
+{
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+ if (this_start < this_end) {
+ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+ (unsigned long long)this_start,
+ (unsigned long long)this_end,
+ (unsigned long long)cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
+ }
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
+{
+ unsigned int i;
+ unsigned int idx = 0;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
+ idx = i % ARRAY_SIZE(patterns);
+ do_one_pass(patterns[idx], start, end);
+ }
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..f53838fe3dfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Please do not reorder this without considering how mm/ksm.c's
* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
*/
- ClearPageSwapCache(page);
+ if (PageSwapCache(page))
+ ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -901,12 +902,23 @@ out:
}
/*
+ * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
+ * around it.
+ */
+#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#define ICE_noinline noinline
+#else
+#define ICE_noinline
+#endif
+
+/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct page *page, int force,
- enum migrate_mode mode)
+static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+ free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode)
{
int rc = 0;
int *result = NULL;
@@ -1554,30 +1566,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
- * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
- * as it is faults that reset the window, pte updates will happen unconditionally
- * if there has not been a fault since @pteupdate_interval_millisecs after the
- * throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-/* Returns true if NUMA migration is currently rate limited */
-bool migrate_ratelimited(int node)
-{
- pg_data_t *pgdat = NODE_DATA(node);
-
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
- msecs_to_jiffies(pteupdate_interval_millisecs)))
- return false;
-
- if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
- return false;
-
- return true;
-}
-
/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
unsigned long nr_pages)
diff --git a/mm/mlock.c b/mm/mlock.c
index 8a54cd214925..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,62 +205,6 @@ out:
return nr_pages - 1;
}
-/**
- * __mlock_vma_pages_range() - mlock a range of pages in the vma.
- * @vma: target vma
- * @start: start address
- * @end: end address
- * @nonblocking:
- *
- * This takes care of making the pages present too.
- *
- * return 0 on success, negative error code on error.
- *
- * vma->vm_mm->mmap_sem must be held.
- *
- * If @nonblocking is NULL, it may be held for read or write and will
- * be unperturbed.
- *
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released. If it's released, *@nonblocking will be set to 0.
- */
-long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long nr_pages = (end - start) / PAGE_SIZE;
- int gup_flags;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
- gup_flags = FOLL_TOUCH | FOLL_MLOCK;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
- gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
- gup_flags |= FOLL_FORCE;
-
- /*
- * We made sure addr is within a VMA, so the following will
- * not result in a stack expansion that recurses back here.
- */
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
-}
-
/*
* convert get_user_pages() return value to posix mlock() error
*/
@@ -596,7 +540,7 @@ success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
- * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
+ * set VM_LOCKED, populate_vma_page_range will bring it back.
*/
if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-/*
- * __mm_populate - populate and/or mlock pages within a range of address space.
- *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
- */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
-{
- struct mm_struct *mm = current->mm;
- unsigned long end, nstart, nend;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
- long ret = 0;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(len != PAGE_ALIGN(len));
- end = start + len;
-
- for (nstart = start; nstart < end; nstart = nend) {
- /*
- * We want to fault in pages for [nstart; end) address range.
- * Find first corresponding VMA.
- */
- if (!locked) {
- locked = 1;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * Set [nstart; nend) to intersection of desired address
- * range with the first VMA. Also, skip undesirable VMA types.
- */
- nend = min(end, vma->vm_end);
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- /*
- * Now fault in a range of pages. __mlock_vma_pages_range()
- * double checks the vma flags, so that it won't mlock pages
- * if the vma was already munlocked.
- */
- ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
- if (ret < 0) {
- if (ignore_errors) {
- ret = 0;
- continue; /* continue at next VMA */
- }
- ret = __mlock_posix_error_return(ret);
- break;
- }
- nend = nstart + ret * PAGE_SIZE;
- ret = 0;
- }
- if (locked)
- up_read(&mm->mmap_sem);
- return ret; /* 0 or negative error code */
-}
-
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
- if (!error)
- error = __mm_populate(start, len, 0);
- return error;
+ if (error)
+ return error;
+
+ error = __mm_populate(start, len, 0);
+ if (error)
+ return __mlock_posix_error_return(error);
+ return 0;
}
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mmap.c b/mm/mmap.c
index 9ec50a368634..bb50cacc3ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
* acceptable for merging, so we can do all of this optimistically. But
- * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
*
* IOW: that the "list_is_singular()" test on the anon_vma_chain only
* matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
if (anon_vma_compatible(a, b)) {
- struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
if (anon_vma && list_is_singular(&old->anon_vma_chain))
return anon_vma;
@@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Clear old maps */
error = -ENOMEM;
-munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/*
@@ -1571,7 +1570,8 @@ munmap_back:
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+ NULL);
if (vma)
goto out;
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
actual_size = size;
if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
actual_size -= PAGE_SIZE;
- if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
@@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
@@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
@@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/*
* Clear old maps. this also does some error checking for us
*/
- munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/* Check against address space limits *after* clearing old maps... */
diff --git a/mm/mremap.c b/mm/mremap.c
index 57dadc025c64..034e2d360652 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- } else if (vma->vm_file && vma->vm_file->f_op->mremap)
- vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+ } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
+ err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+ if (err < 0) {
+ move_page_tables(new_vma, new_addr, vma, old_addr,
+ moved_len, true);
+ return err;
+ }
+ }
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -339,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
struct vm_area_struct *vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
if (is_vm_hugetlb_page(vma))
- goto Einval;
+ return ERR_PTR(-EINVAL);
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
/* Need to be careful about a growing mapping */
if (new_len > old_len) {
unsigned long pgoff;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- goto Efault;
+ return ERR_PTR(-EFAULT);
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- goto Einval;
+ return ERR_PTR(-EINVAL);
}
if (vma->vm_flags & VM_LOCKED) {
@@ -366,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto Eagain;
+ return ERR_PTR(-EAGAIN);
}
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
- goto Enomem;
+ return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
- goto Efault;
+ return ERR_PTR(-ENOMEM);
*p = charged;
}
return vma;
-
-Efault: /* very odd choice for most of the cases, but... */
- return ERR_PTR(-EFAULT);
-Einval:
- return ERR_PTR(-EINVAL);
-Enomem:
- return ERR_PTR(-ENOMEM);
-Eagain:
- return ERR_PTR(-EAGAIN);
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
diff --git a/mm/nommu.c b/mm/nommu.c
index 3fba2dc97c44..e544508e2a4b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1016,7 +1016,7 @@ static int validate_mmap_request(struct file *file,
* device */
if (!file->f_op->get_unmapped_area)
capabilities &= ~NOMMU_MAP_DIRECT;
- if (!file->f_op->read)
+ if (!(file->f_mode & FMODE_CAN_READ))
capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
@@ -1240,7 +1240,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+ ret = __vfs_read(vma->vm_file, base, len, &fpos);
set_fs(old_fs);
if (ret < 0)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
static DECLARE_RWSEM(oom_sem);
/**
- * mark_tsk_oom_victim - marks the given taks as OOM victim.
+ * mark_tsk_oom_victim - marks the given task as OOM victim.
* @tsk: task to mark
*
* Has to be called with oom_sem taken for read and never after
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask)
+ int order, const nodemask_t *nodemask,
+ struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ dump_header(NULL, gfp_mask, order, memcg, nodemask);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 644bcb665773..5daf5568b9e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for deaccounting dirty page without writeback.
+ *
+ * Doing this should *normally* only ever be done when a page
+ * is truncated, and is not actually mapped anywhere at all. However,
+ * fs/buffer.c does this when it notices that somebody has cleaned
+ * out all the buffers on a page without actually doing it through
+ * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ */
+void account_page_cleaned(struct page *page, struct address_space *mapping)
+{
+ if (mapping_cap_account_dirty(mapping)) {
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ }
+}
+EXPORT_SYMBOL(account_page_cleaned);
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -2209,7 +2228,8 @@ int set_page_dirty(struct page *page)
* it will confuse readahead and make it restart the size rampup
* process. But it's a trivial problem.
*/
- ClearPageReclaim(page);
+ if (PageReclaim(page))
+ ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..ebffa0e4a9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
-#else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; }
+#endif
+
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
*/
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually steal whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock and check whether half of pages are moved or not. If half of
+ * pages are moved, we can change migratetype of pageblock and permanently
+ * use it's pages as requested migratetype in the future.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ int start_type)
{
int current_order = page_order(page);
+ int pages;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
return;
}
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- start_type == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled) {
- int pages;
+ pages = move_freepages_block(zone, page, start_type);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_RESERVE)
+ break;
+
+ if (list_empty(&area->free_list[fallback_mt]))
+ continue;
- pages = move_freepages_block(zone, page, start_type);
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
+ if (!only_stealable)
+ return fallback_mt;
+
+ if (*can_steal)
+ return fallback_mt;
}
+
+ return -1;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
+ int fallback_mt;
+ bool can_steal;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
- int i;
- for (i = 0;; i++) {
- int migratetype = fallbacks[start_migratetype][i];
- int buddy_type = start_migratetype;
-
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
-
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- if (!is_migrate_cma(migratetype)) {
- try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
- } else {
- /*
- * When borrowing from MIGRATE_CMA, we need to
- * release the excess buddy pages to CMA
- * itself, and we do not try to steal extra
- * free pages.
- */
- buddy_type = migratetype;
- }
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
+ page = list_entry(area->free_list[fallback_mt].next,
+ struct page, lru);
+ if (can_steal)
+ steal_suitable_fallback(zone, page, start_migratetype);
- expand(zone, page, order, current_order, area,
- buddy_type);
+ /* Remove the page from the freelists */
+ area->nr_free--;
+ list_del(&page->lru);
+ rmv_page_order(page);
- /*
- * The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
- */
- set_freepage_migratetype(page, buddy_type);
+ expand(zone, page, order, current_order, area,
+ start_migratetype);
+ /*
+ * The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
+ */
+ set_freepage_migratetype(page, start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
- return page;
- }
+ return page;
}
return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (migratetype == MIGRATE_MOVABLE)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
int to_drain, batch;
local_irq_save(flags);
- batch = ACCESS_ONCE(pcp->batch);
+ batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- unsigned long batch = ACCESS_ONCE(pcp->batch);
+ unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
@@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 1;
goto out;
}
- /*
- * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
- * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
- * The caller should handle page allocation failure by itself if
- * it specifies __GFP_THISNODE.
- * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
- */
+ /* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
@@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
- * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- * __GFP_NOWARN set) should not cause reclaim since the subsystem
- * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- * using a larger set of nodes after it has established that the
- * allowed per node queues are empty and that nodes are
- * over allocated.
+ * If this allocation cannot block and it is for a specific node, then
+ * fail early. There's no need to wakeup kswapd or retry for a
+ * speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
goto nopage;
retry:
@@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
+ * of __GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
@@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type)
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
*/
void show_free_areas(unsigned int filter)
{
+ unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
- show_node(zone);
- printk("%s per-cpu:\n", zone->name);
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pageset;
-
- pageset = per_cpu_ptr(zone->pageset, cpu);
-
- printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp.high,
- pageset->pcp.batch, pageset->pcp.count);
- }
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free_cma:%lu\n",
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_PAGES),
+ free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
@@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter)
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
" free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
@@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas controls asynch page reclaim, and so should
+ * deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -6164,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
mask <<= (BITS_PER_LONG - bitidx - 1);
flags <<= (BITS_PER_LONG - bitidx - 1);
- word = ACCESS_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
diff --git a/mm/page_io.c b/mm/page_io.c
index e6045804c8d8..6424869e275e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,8 +20,8 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
-#include <linux/aio.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
- kiocb.ki_nbytes = PAGE_SIZE;
set_page_writeback(page);
unlock_page(page);
- ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
- &kiocb, &from,
- kiocb.ki_pos);
+ ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
diff --git a/mm/percpu.c b/mm/percpu.c
index 73c97a5f4495..dfd02484e8de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr)
* and, from the second one, the backing allocator (currently either vm or
* km) provides translation.
*
- * The addr can be tranlated simply without checking if it falls into the
+ * The addr can be translated simply without checking if it falls into the
* first chunk. But the current code reflects better how percpu allocator
* actually works, and the verification can discover both bugs in percpu
* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
@@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
- * Groups are always mutliples of atom size and CPUs which are of
+ * Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes on different groups and >=75% usage
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b1597690530c..e88d071648c2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec *iov_r = iovstack_r;
struct iov_iter iter;
ssize_t rc;
+ int dir = vm_write ? WRITE : READ;
if (flags != 0)
return -EINVAL;
/* Check iovecs */
- if (vm_write)
- rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l);
- else
- rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l);
- if (rc <= 0)
+ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+ if (rc < 0)
+ return rc;
+ if (!iov_iter_count(&iter))
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
-
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
@@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid,
free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
- if (iov_l != iovstack_l)
- kfree(iov_l);
+ kfree(iov_l);
return rc;
}
@@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid,
struct iovec *iov_r = iovstack_r;
struct iov_iter iter;
ssize_t rc = -EFAULT;
+ int dir = vm_write ? WRITE : READ;
if (flags != 0)
return -EINVAL;
- if (vm_write)
- rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
- UIO_FASTIOV, iovstack_l,
- &iov_l);
- else
- rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
- UIO_FASTIOV, iovstack_l,
- &iov_l);
- if (rc <= 0)
+ rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+ if (rc < 0)
+ return rc;
+ if (!iov_iter_count(&iter))
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
&iov_r);
@@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid,
free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
- if (iov_l != iovstack_l)
- kfree(iov_l);
+ kfree(iov_l);
return rc;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index c161a14b6a8f..24dd3f9fee27 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -456,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
@@ -500,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca010bc..1ea2400b5245 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,7 +31,7 @@
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
static struct vfsmount *shm_mnt;
@@ -3118,8 +3118,6 @@ static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
return NULL;
}
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return flags;
+}
+
#else /* CONFIG_NUMA */
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return __cache_free_alien(cachep, objp, node, page_node);
}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not invoke reclaim
+ * or warn about failures.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+}
#endif
/*
@@ -2825,7 +2839,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
force_grow:
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (obj)
break;
}
@@ -3047,7 +3061,7 @@ retry:
nid = page_to_nid(page);
if (cache_grow(cache, flags, nid, page)) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (!obj)
/*
* Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
if (x)
goto retry;
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
return 0;
}
-void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
}
-EXPORT_SYMBOL(slob_alloc_node);
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
diff --git a/mm/slub.c b/mm/slub.c
index 82c473780c91..54c0876b43d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
page->freelist = freelist_new;
set_page_slub_counters(page, counters_new);
slab_unlock(page);
- return 1;
+ return true;
}
slab_unlock(page);
}
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
- return 1;
+ return true;
}
slab_unlock(page);
local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
*/
goto check_slabs;
- if (tolower(*str) == 'o') {
- /*
- * Avoid enabling debugging on caches if its minimum order
- * would increase as a result.
- */
- disable_higher_order_debug = 1;
- goto out;
- }
-
slub_debug = 0;
if (*str == '-')
/*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
case 'a':
slub_debug |= SLAB_FAILSLAB;
break;
+ case 'o':
+ /*
+ * Avoid enabling debugging on caches if its minimum
+ * order would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ break;
default:
pr_err("slub_debug option '%c' unknown. skipped\n",
*str);
@@ -4279,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int node;
struct page *page;
- page = ACCESS_ONCE(c->page);
+ page = READ_ONCE(c->page);
if (!page)
continue;
@@ -4294,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[node] += x;
- page = ACCESS_ONCE(c->partial);
+ page = READ_ONCE(c->partial);
if (page) {
node = page_to_nid(page);
if (flags & SO_TOTAL)
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a7251a8ed532 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/hugetlb.h>
#include "internal.h"
@@ -42,7 +43,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
{
compound_page_dtor *dtor;
- __page_cache_release(page);
+ /*
+ * __page_cache_release() is supposed to be called for thp, not for
+ * hugetlb. This is because hugetlb page does never have PageLRU set
+ * (it's never listed to any LRU lists) and no memcg routines should
+ * be called for hugetlb (it has a separate hugetlb_cgroup.)
+ */
+ if (!PageHuge(page))
+ __page_cache_release(page);
dtor = get_compound_page_dtor(page);
(*dtor)(page);
}
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
int lru, file;
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu)
local_irq_restore(flags);
}
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
activate_page_drain(cpu);
}
/**
- * deactivate_page - forcefully deactivate a page
+ * deactivate_file_page - forcefully deactivate a file page
* @page: page to deactivate
*
* This function hints the VM that @page is a good reclaim candidate,
* for example if its invalidation fails due to the page being dirty
* or under writeback.
*/
-void deactivate_page(struct page *page)
+void deactivate_file_page(struct page *page)
{
/*
- * In a workload with many unevictable page such as mprotect, unevictable
- * page deactivation for accelerating reclaim is pointless.
+ * In a workload with many unevictable page such as mprotect,
+ * unevictable page deactivation for accelerating reclaim is pointless.
*/
if (PageUnevictable(page))
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
if (!pagevec_add(pvec, page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ put_cpu_var(lru_deactivate_file_pvecs);
}
}
@@ -872,7 +880,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
unsigned int pages, max_pages, last_ra;
static atomic_t last_readahead_pages;
- max_pages = 1 << ACCESS_ONCE(page_cluster);
+ max_pages = 1 << READ_ONCE(page_cluster);
if (max_pages <= 1)
return 1;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
else
continue;
}
- count = ACCESS_ONCE(si->swap_map[i]);
+ count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
}
/*
- * This cancels just the dirty bit on the kernel page itself, it
- * does NOT actually remove dirty bits on any mmap's that may be
- * around. It also leaves the page tagged dirty, so any sync
- * activity will still find it on the dirty lists, and in particular,
- * clear_page_dirty_for_io() will still look at the dirty bits in
- * the VM.
- *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
- */
-void cancel_dirty_page(struct page *page, unsigned int account_size)
-{
- if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- if (account_size)
- task_io_account_cancelled_write(account_size);
- }
- }
-}
-EXPORT_SYMBOL(cancel_dirty_page);
-
-/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ * Hence dirty accounting check is placed after invalidation.
+ */
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
@@ -513,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* of interest and try to speed up its reclaim.
*/
if (!ret)
- deactivate_page(page);
+ deactivate_file_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+static inline void *__page_rmapping(struct page *page)
+{
+ unsigned long mapping;
+
+ mapping = (unsigned long)page->mapping;
+ mapping &= ~PAGE_MAPPING_FLAGS;
+
+ return (void *)mapping;
+}
+
+/* Neutral page->mapping pointer to address_space or anon_vma or other */
+void *page_rmapping(struct page *page)
+{
+ page = compound_head(page);
+ return __page_rmapping(page);
+}
+
+struct anon_vma *page_anon_vma(struct page *page)
+{
+ unsigned long mapping;
+
+ page = compound_head(page);
+ mapping = (unsigned long)page->mapping;
+ if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ return NULL;
+ return __page_rmapping(page);
+}
+
struct address_space *page_mapping(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ unsigned long mapping;
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
swp_entry_t entry;
entry.val = page_private(page);
- mapping = swap_address_space(entry);
- } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
- mapping = NULL;
- return mapping;
+ return swap_address_space(entry);
+ }
+
+ mapping = (unsigned long)page->mapping;
+ if (mapping & PAGE_MAPPING_FLAGS)
+ return NULL;
+ return page->mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49abccf29a29..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_clear_huge(pmd))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_clear_huge(pud))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
@@ -760,7 +765,7 @@ struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
- DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+ unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
@@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
return addr;
}
-static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+ unsigned long addr;
+
+ addr = va_start + (pages_off << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+ return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block. Of course pages number can't exceed VMAP_BBMAP_BITS
+ * @order: how many 2^order pages should be occupied in newly allocated block
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ */
+static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
int node, err;
+ void *vaddr;
node = numa_node_id();
@@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return ERR_PTR(err);
}
+ vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
- vb->free = VMAP_BBMAP_BITS;
+ /* At least something should be left free */
+ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
- bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
- list_add_rcu(&vb->free_list, &vbq->free);
+ list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
- return vb;
+ return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
@@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
@@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
- unsigned long addr = 0;
+ void *vaddr = NULL;
unsigned int order;
BUG_ON(size & ~PAGE_MASK);
@@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
}
order = get_order(size);
-again:
rcu_read_lock();
vbq = &get_cpu_var(vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i;
+ unsigned long pages_off;
spin_lock(&vb->lock);
- if (vb->free < 1UL << order)
- goto next;
+ if (vb->free < (1UL << order)) {
+ spin_unlock(&vb->lock);
+ continue;
+ }
- i = VMAP_BBMAP_BITS - vb->free;
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
+ pages_off = VMAP_BBMAP_BITS - vb->free;
+ vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
vb->free -= 1UL << order;
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
+
spin_unlock(&vb->lock);
break;
-next:
- spin_unlock(&vb->lock);
}
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
- if (!addr) {
- vb = new_vmap_block(gfp_mask);
- if (IS_ERR(vb))
- return vb;
- goto again;
- }
+ /* Allocate new block if nothing was found */
+ if (!vaddr)
+ vaddr = new_vmap_block(order, gfp_mask);
- return (void *)addr;
+ return vaddr;
}
static void vb_free(const void *addr, unsigned long size)
@@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
order = get_order(size);
offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+ offset >>= PAGE_SHIFT;
vb_idx = addr_to_vb_idx((unsigned long)addr);
rcu_read_lock();
@@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
spin_lock(&vb->lock);
- BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
+
+ /* Expand dirty range */
+ vb->dirty_min = min(vb->dirty_min, offset);
+ vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i, j;
-
spin_lock(&vb->lock);
- i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
- if (i < VMAP_BBMAP_BITS) {
+ if (vb->dirty) {
+ unsigned long va_start = vb->va->va_start;
unsigned long s, e;
- j = find_last_bit(vb->dirty_map,
- VMAP_BBMAP_BITS);
- j = j + 1; /* need exclusive index */
+ s = va_start + (vb->dirty_min << PAGE_SHIFT);
+ e = va_start + (vb->dirty_max << PAGE_SHIFT);
- s = vb->va->va_start + (i << PAGE_SHIFT);
- e = vb->va->va_start + (j << PAGE_SHIFT);
- flush = 1;
+ start = min(s, start);
+ end = max(e, end);
- if (s < start)
- start = s;
- if (e > end)
- end = e;
+ flush = 1;
}
spin_unlock(&vb->lock);
}
@@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ align = 1ul << clamp_t(int, fls_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
size = PAGE_ALIGN(size);
if (unlikely(!size))
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..08bd7a3d464a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
*/
/*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@@ -57,6 +28,8 @@
*
* page->private (union with page->first_page): refers to the
* component page after the first page
+ * If the page is first_page for huge object, it stores handle.
+ * Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
@@ -78,6 +51,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -110,6 +84,8 @@
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/*
@@ -172,6 +168,8 @@ enum fullness_group {
enum zs_stat_type {
OBJ_ALLOCATED,
OBJ_USED,
+ CLASS_ALMOST_FULL,
+ CLASS_ALMOST_EMPTY,
NR_ZS_STAT_TYPE,
};
@@ -216,6 +214,8 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
- /* Handle of next free chunk (encodes <PFN, obj_idx>) */
- void *next;
+ union {
+ /*
+ * Position of next free chunk (encodes <PFN, obj_idx>)
+ * It's valid for non-allocated object
+ */
+ void *next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
};
struct zs_pool {
char *name;
struct size_class **size_class;
+ struct kmem_cache *handle_cachep;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
+ bool huge;
};
+static int create_handle_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+ kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ *(unsigned long *)handle = obj;
+}
+
/* zpool driver */
#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return idx;
+ return min(zs_size_classes - 1, idx);
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
}
+#endif
+
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
fg = ZS_EMPTY;
else if (inuse == max_objects)
fg = ZS_FULL;
- else if (inuse <= max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * max_objects / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
list_add_tail(&page->lru, &(*head)->lru);
*head = page;
+ zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
struct page, lru);
list_del_init(&page->lru);
+ zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+static enum fullness_group fix_fullness_group(struct size_class *class,
struct page *page)
{
int class_idx;
- struct size_class *class;
enum fullness_group currfg, newfg;
BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
* to form a zspage for each size class. This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as:
- * wastage = Zp - Zp % size_class
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
* where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
*
* For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
/*
* Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
*/
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
- unsigned long handle;
+ unsigned long obj;
if (!page) {
BUG_ON(obj_idx);
return NULL;
}
- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj <<= OBJ_TAG_BITS;
- return (void *)handle;
+ return (void *)obj;
}
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
*/
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
unsigned long *obj_idx)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+ void *obj)
+{
+ if (class->huge) {
+ VM_BUG_ON(!is_first_page(page));
+ return *(unsigned long *)page_private(page);
+ } else
+ return *(unsigned long *)obj;
}
static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
return off + obj_idx * class_size;
}
+static inline int trypin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+ while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
static void reset_page(struct page *page)
{
clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i++);
+ link->next = location_to_obj(page, i++);
link += class->size / sizeof(*link);
}
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
+ link->next = location_to_obj(next_page, 0);
kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
init_zspage(first_page, class);
- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
{
int sizes[2];
void *addr;
- char *buf = area->vm_buf;
+ char *buf;
/* no write fastpath */
if (area->vm_mm == ZS_MM_RO)
goto out;
+ buf = area->vm_buf;
+ if (!area->huge) {
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+ }
+
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
zs_size_classes = nr;
}
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
- return pages_per_zspage * PAGE_SIZE / size;
-}
-
static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] += cnt;
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] -= cnt;
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return class->stats.objs[type];
-}
-
-static int __init zs_stat_init(void)
-{
- if (!debugfs_initialized())
- return -ENODEV;
-
- zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
- if (!zs_stat_root)
- return -ENOMEM;
-
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
- debugfs_remove_recursive(zs_stat_root);
-}
-
-static int zs_stats_size_show(struct seq_file *s, void *v)
+static bool zspage_full(struct page *page)
{
- int i;
- struct zs_pool *pool = s->private;
- struct size_class *class;
- int objs_per_zspage;
- unsigned long obj_allocated, obj_used, pages_used;
- unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
-
- seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
- "obj_allocated", "obj_used", "pages_used");
-
- for (i = 0; i < zs_size_classes; i++) {
- class = pool->size_class[i];
-
- if (class->index != i)
- continue;
-
- spin_lock(&class->lock);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
- spin_unlock(&class->lock);
-
- objs_per_zspage = get_maxobj_per_zspage(class->size,
- class->pages_per_zspage);
- pages_used = obj_allocated / objs_per_zspage *
- class->pages_per_zspage;
-
- seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
- class->size, obj_allocated, obj_used, pages_used);
-
- total_objs += obj_allocated;
- total_used_objs += obj_used;
- total_pages += pages_used;
- }
-
- seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
- total_objs, total_used_objs, total_pages);
-
- return 0;
-}
-
-static int zs_stats_size_open(struct inode *inode, struct file *file)
-{
- return single_open(file, zs_stats_size_show, inode->i_private);
-}
-
-static const struct file_operations zs_stat_size_ops = {
- .open = zs_stats_size_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- struct dentry *entry;
-
- if (!zs_stat_root)
- return -ENODEV;
-
- entry = debugfs_create_dir(name, zs_stat_root);
- if (!entry) {
- pr_warn("debugfs dir <%s> creation failed\n", name);
- return -ENOMEM;
- }
- pool->stat_dentry = entry;
-
- entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
- pool->stat_dentry, pool, &zs_stat_size_ops);
- if (!entry) {
- pr_warn("%s: debugfs file entry <%s> creation failed\n",
- name, "obj_in_classes");
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void zs_pool_stat_destroy(struct zs_pool *pool)
-{
- debugfs_remove_recursive(pool->stat_dentry);
-}
-
-#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
-static int __init zs_stat_init(void)
-{
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
-}
-
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- return 0;
-}
+ BUG_ON(!is_first_page(page));
-static inline void zs_pool_stat_destroy(struct zs_pool *pool)
-{
+ return page->inuse == page->objects;
}
-#endif
-
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
struct page *pages[2];
+ void *ret;
BUG_ON(!handle);
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());
- obj_handle_to_location(handle, &page, &obj_idx);
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ ret = area->vm_addr + off;
+ goto out;
}
/* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
pages[1] = get_next_page(page);
BUG_ON(!pages[1]);
- return __zs_map_object(area, pages, off, class->size);
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (!class->huge)
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
BUG_ON(!handle);
- obj_handle_to_location(handle, &page, &obj_idx);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
put_cpu_var(zs_map_area);
+ unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
+static unsigned long obj_malloc(struct page *first_page,
+ struct size_class *class, unsigned long handle)
+{
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_objidx, m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = (unsigned long)first_page->freelist;
+ obj_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ if (!class->huge)
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle in first_page->private */
+ set_page_private(first_page, handle);
+ kunmap_atomic(vaddr);
+ first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ return obj;
+}
+
+
/**
* zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
- unsigned long obj;
- struct link_free *link;
+ unsigned long handle, obj;
struct size_class *class;
- void *vaddr;
-
- struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ struct page *first_page;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
+ handle = alloc_handle(pool);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page))
+ if (unlikely(!first_page)) {
+ free_handle(pool, handle);
return 0;
+ }
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class->size, class->pages_per_zspage));
}
- obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
- vaddr = kmap_atomic(m_page);
- link = (struct link_free *)vaddr + m_offset / sizeof(*link);
- first_page->freelist = link->next;
- memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(vaddr);
-
- first_page->inuse++;
- zs_stat_inc(class, OBJ_USED, 1);
+ obj = obj_malloc(first_page, class, handle);
/* Now move the zspage to another fullness group, if required */
- fix_fullness_group(pool, first_page);
+ fix_fullness_group(class, first_page);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- return obj;
+ return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-void zs_free(struct zs_pool *pool, unsigned long obj)
+static void obj_free(struct zs_pool *pool, struct size_class *class,
+ unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
-
int class_idx;
- struct size_class *class;
enum fullness_group fullness;
- if (unlikely(!obj))
- return;
+ BUG_ON(!obj);
- obj_handle_to_location(obj, &f_page, &f_objidx);
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
- spin_lock(&class->lock);
+ vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
- vaddr = kmap_atomic(f_page);
link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
+ if (class->huge)
+ set_page_private(first_page, 0);
kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
-
first_page->inuse--;
- fullness = fix_fullness_group(pool, first_page);
-
zs_stat_dec(class, OBJ_USED, 1);
- if (fullness == ZS_EMPTY)
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *first_page, *f_page;
+ unsigned long obj, f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!handle))
+ return;
+
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(pool, class, obj);
+ fullness = fix_fullness_group(class, first_page);
+ if (fullness == ZS_EMPTY) {
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
class->size, class->pages_per_zspage));
-
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+ free_zspage(first_page);
+ }
spin_unlock(&class->lock);
+ unpin_tag(handle);
+
+ free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(unsigned long src, unsigned long dst,
+ struct size_class *class)
+{
+ struct page *s_page, *d_page;
+ unsigned long s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+ d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ BUG_ON(!s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ BUG_ON(!d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+ struct size_class *class)
+{
+ unsigned long head;
+ int offset = 0;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ if (!is_first_page(page))
+ offset = page->index;
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(class, page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source page for migration which could be a subpage of zspage. */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int index;
+ /* how many of objects are migrated */
+ int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ unsigned long index = cc->index;
+ int nr_migrated = 0;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(s_page, index, class);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ index = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(d_page)) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(d_page, class, handle);
+ zs_object_copy(used_obj, free_obj, class);
+ index++;
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(pool, class, used_obj);
+ nr_migrated++;
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->index = index;
+ cc->nr_migrated = nr_migrated;
+
+ return ret;
+}
+
+static struct page *alloc_target_page(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page) {
+ remove_zspage(page, class, i);
+ break;
+ }
+ }
+
+ return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+ struct page *first_page)
+{
+ enum fullness_group fullness;
+
+ BUG_ON(!is_first_page(first_page));
+
+ fullness = get_fullness_group(first_page);
+ insert_zspage(first_page, class, fullness);
+ set_zspage_mapping(first_page, class->index, fullness);
if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
+
free_zspage(first_page);
}
}
-EXPORT_SYMBOL_GPL(zs_free);
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+ struct page *page;
+
+ page = class->fullness_list[ZS_ALMOST_EMPTY];
+ if (page)
+ remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+ return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ int nr_to_migrate;
+ struct zs_compact_control cc;
+ struct page *src_page;
+ struct page *dst_page = NULL;
+ unsigned long nr_total_migrated = 0;
+
+ spin_lock(&class->lock);
+ while ((src_page = isolate_source_page(class))) {
+
+ BUG_ON(!is_first_page(src_page));
+
+ /* The goal is to migrate all live objects in source page */
+ nr_to_migrate = src_page->inuse;
+ cc.index = 0;
+ cc.s_page = src_page;
+
+ while ((dst_page = alloc_target_page(class))) {
+ cc.d_page = dst_page;
+ /*
+ * If there is no more space in dst_page, try to
+ * allocate another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ nr_total_migrated += cc.nr_migrated;
+ nr_to_migrate -= cc.nr_migrated;
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_page == NULL)
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ putback_zspage(pool, class, src_page);
+ spin_unlock(&class->lock);
+ nr_total_migrated += cc.nr_migrated;
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_page)
+ putback_zspage(pool, class, src_page);
+
+ spin_unlock(&class->lock);
+
+ return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ unsigned long nr_migrated = 0;
+ struct size_class *class;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ nr_migrated += __zs_compact(pool, class);
+ }
+
+ return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
/**
* zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (!pool)
return NULL;
- pool->name = kstrdup(name, GFP_KERNEL);
- if (!pool->name) {
- kfree(pool);
- return NULL;
- }
-
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
- kfree(pool->name);
kfree(pool);
return NULL;
}
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+ if (create_handle_cache(pool))
+ goto err;
+
/*
* Iterate reversly, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
+ if (pages_per_zspage == 1 &&
+ get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+ class->huge = true;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
+ destroy_handle_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);