aboutsummaryrefslogtreecommitdiffstats
path: root/mm/mmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mmap.c')
-rw-r--r--mm/mmap.c203
1 files changed, 183 insertions, 20 deletions
diff --git a/mm/mmap.c b/mm/mmap.c
index 0db0de1c2fbe..1ae21d645c68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -6,6 +6,7 @@
* Address space accounting code <alan@lxorguk.ukuu.org.uk>
*/
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
@@ -33,6 +34,8 @@
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
#include <linux/sched/sysctl.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -84,6 +87,8 @@ EXPORT_SYMBOL(vm_get_page_prot);
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
/*
* Make sure vm_committed_as in one cacheline and not cacheline shared with
* other variables. It can be updated by several CPUs frequently.
@@ -122,7 +127,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed;
+ unsigned long free, allowed, reserve;
vm_acct_memory(pages);
@@ -163,10 +168,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
free -= totalreserve_pages;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- free -= free / 32;
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
if (free > pages)
return 0;
@@ -177,16 +182,19 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
allowed = (totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100;
/*
- * Leave the last 3% for root
+ * Reserve some for root
*/
if (!cap_sys_admin)
- allowed -= allowed / 32;
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
allowed += total_swap_pages;
- /* Don't let a single process grow too big:
- leave 3% of the size of this process for other processes */
- if (mm)
- allowed -= mm->total_vm / 32;
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min(mm->total_vm / 32, reserve);
+ }
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
@@ -543,6 +551,34 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
+{
+ unsigned long nr_pages = 0;
+ struct vm_area_struct *vma;
+
+ /* Find first overlaping mapping */
+ vma = find_vma_intersection(mm, addr, end);
+ if (!vma)
+ return 0;
+
+ nr_pages = (min(end, vma->vm_end) -
+ max(addr, vma->vm_start)) >> PAGE_SHIFT;
+
+ /* Iterate over the rest of the overlaps */
+ for (vma = vma->vm_next; vma; vma = vma->vm_next) {
+ unsigned long overlap_len;
+
+ if (vma->vm_start > end)
+ break;
+
+ overlap_len = min(end, vma->vm_end) - vma->vm_start;
+ nr_pages += overlap_len >> PAGE_SHIFT;
+ }
+
+ return nr_pages;
+}
+
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
struct rb_node **rb_link, struct rb_node *rb_parent)
{
@@ -829,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (next->anon_vma)
anon_vma_merge(vma, next);
mm->map_count--;
- mpol_put(vma_policy(next));
+ vma_set_policy(vma, vma_policy(next));
kmem_cache_free(vm_area_cachep, next);
/*
* In mprotect's case 6 (see comments on vma_merge),
@@ -1327,15 +1363,20 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
file = fget(fd);
if (!file)
goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
+
+ len = ALIGN(len, huge_page_size(hstate_sizelog(
+ (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK)));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1435,6 +1476,23 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long charged = 0;
struct inode *inode = file ? file_inode(file) : NULL;
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ if (!(vm_flags & MAP_FIXED))
+ return -ENOMEM;
+
+ nr_pages = count_vma_pages_range(mm, addr, addr + len);
+
+ if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
/* Clear old maps */
error = -ENOMEM;
munmap_back:
@@ -1444,10 +1502,6 @@ munmap_back:
goto munmap_back;
}
- /* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
- return -ENOMEM;
-
/*
* Private writable mapping: check memory availability
*/
@@ -1935,9 +1989,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
struct vm_area_struct *vma = NULL;
- if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
- return NULL;
-
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
vma = ACCESS_ONCE(mm->mmap_cache);
@@ -2305,7 +2356,7 @@ static void unmap_region(struct mm_struct *mm,
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : 0);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2685,7 +2736,7 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
/*
@@ -3097,3 +3148,115 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0);
VM_BUG_ON(ret);
}
+
+/*
+ * Initialise sysctl_user_reserve_kbytes.
+ *
+ * This is intended to prevent a user from starting a single memory hogging
+ * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER
+ * mode.
+ *
+ * The default value is min(3% of free memory, 128MB)
+ * 128MB is enough to recover with sshd/login, bash, and top/kill.
+ */
+static int init_user_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
+ return 0;
+}
+module_init(init_user_reserve)
+
+/*
+ * Initialise sysctl_admin_reserve_kbytes.
+ *
+ * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
+ * to log in and kill a memory hogging process.
+ *
+ * Systems with more than 256MB will reserve 8MB, enough to recover
+ * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
+ * only reserve 3% of free pages by default.
+ */
+static int init_admin_reserve(void)
+{
+ unsigned long free_kbytes;
+
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
+ return 0;
+}
+module_init(init_admin_reserve)
+
+/*
+ * Reinititalise user and admin reserves if memory is added or removed.
+ *
+ * The default user reserve max is 128MB, and the default max for the
+ * admin reserve is 8MB. These are usually, but not always, enough to
+ * enable recovery from a memory hogging process using login/sshd, a shell,
+ * and tools like top. It may make sense to increase or even disable the
+ * reserve depending on the existence of swap or variations in the recovery
+ * tools. So, the admin may have changed them.
+ *
+ * If memory is added and the reserves have been eliminated or increased above
+ * the default max, then we'll trust the admin.
+ *
+ * If memory is removed and there isn't enough free memory, then we
+ * need to reset the reserves.
+ *
+ * Otherwise keep the reserve set by the admin.
+ */
+static int reserve_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ unsigned long tmp, free_kbytes;
+
+ switch (action) {
+ case MEM_ONLINE:
+ /* Default max is 128MB. Leave alone if modified by operator. */
+ tmp = sysctl_user_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 17))
+ init_user_reserve();
+
+ /* Default max is 8MB. Leave alone if modified by operator. */
+ tmp = sysctl_admin_reserve_kbytes;
+ if (0 < tmp && tmp < (1UL << 13))
+ init_admin_reserve();
+
+ break;
+ case MEM_OFFLINE:
+ free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10);
+
+ if (sysctl_user_reserve_kbytes > free_kbytes) {
+ init_user_reserve();
+ pr_info("vm.user_reserve_kbytes reset to %lu\n",
+ sysctl_user_reserve_kbytes);
+ }
+
+ if (sysctl_admin_reserve_kbytes > free_kbytes) {
+ init_admin_reserve();
+ pr_info("vm.admin_reserve_kbytes reset to %lu\n",
+ sysctl_admin_reserve_kbytes);
+ }
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block reserve_mem_nb = {
+ .notifier_call = reserve_mem_notifier,
+};
+
+static int __meminit init_reserve_notifier(void)
+{
+ if (register_hotmemory_notifier(&reserve_mem_nb))
+ printk("Failed registering memory add/remove notifier for admin reserve");
+
+ return 0;
+}
+module_init(init_reserve_notifier)