diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/highmem.c | 2 | ||||
-rw-r--r-- | mm/huge_memory.c | 11 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 198 | ||||
-rw-r--r-- | mm/kasan/report.c | 40 | ||||
-rw-r--r-- | mm/madvise.c | 7 | ||||
-rw-r--r-- | mm/memory.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 585 | ||||
-rw-r--r-- | mm/percpu.c | 61 | ||||
-rw-r--r-- | mm/slub.c | 14 | ||||
-rw-r--r-- | mm/vmalloc.c | 8 |
12 files changed, 595 insertions, 339 deletions
diff --git a/mm/debug.c b/mm/debug.c index 0461df1207cb..74ee73cf7079 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -153,7 +153,7 @@ void dump_mm(const struct mm_struct *mm) #endif "exe_file %px\n" #ifdef CONFIG_MMU_NOTIFIER - "mmu_notifier_mm %px\n" + "notifier_subscriptions %px\n" #endif #ifdef CONFIG_NUMA_BALANCING "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" @@ -185,7 +185,7 @@ void dump_mm(const struct mm_struct *mm) #endif mm->exe_file, #ifdef CONFIG_MMU_NOTIFIER - mm->mmu_notifier_mm, + mm->notifier_subscriptions, #endif #ifdef CONFIG_NUMA_BALANCING mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, diff --git a/mm/highmem.c b/mm/highmem.c index 107b10f9878e..64d8dea47dd1 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,7 +29,7 @@ #include <linux/highmem.h> #include <linux/kgdb.h> #include <asm/tlbflush.h> - +#include <linux/vmalloc.h> #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) DEFINE_PER_CPU(int, __kmap_atomic_idx); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a88093213674..f39689a29128 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,6 +527,17 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } +bool is_transparent_hugepage(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return is_huge_zero_page(page) || + page[1].compound_dtor == TRANSHUGE_PAGE_DTOR; +} +EXPORT_SYMBOL_GPL(is_transparent_hugepage); + static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, loff_t off, unsigned long flags, unsigned long size) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 2ac38bdc18a1..e434b05416c6 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -3,6 +3,10 @@ * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * + * Cgroup v2 + * Copyright (C) 2019 Red Hat, Inc. + * Author: Giuseppe Scrivano <gscrivan@redhat.com> + * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. @@ -19,18 +23,36 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> +enum hugetlb_memory_event { + HUGETLB_MAX, + HUGETLB_NR_MEMORY_EVENTS, +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; + /* * the counter to account for hugepages from hugetlb. */ struct page_counter hugepage[HUGE_MAX_HSTATE]; + + atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; + atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS]; + + /* Handle for "hugetlb.events" */ + struct cgroup_file events_file[HUGE_MAX_HSTATE]; + + /* Handle for "hugetlb.events.local" */ + struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; }; #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +#define hugetlb_cgroup_from_counter(counter, idx) \ + container_of(counter, struct hugetlb_cgroup, hugepage[idx]) + static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -178,6 +200,19 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) } while (hugetlb_cgroup_have_usage(h_cg)); } +static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, + enum hugetlb_memory_event event) +{ + atomic_long_inc(&hugetlb->events_local[idx][event]); + cgroup_file_notify(&hugetlb->events_local_file[idx]); + + do { + atomic_long_inc(&hugetlb->events[idx][event]); + cgroup_file_notify(&hugetlb->events_file[idx]); + } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && + !hugetlb_cgroup_is_root(hugetlb)); +} + int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { @@ -202,8 +237,12 @@ again: } rcu_read_unlock(); - if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, + &counter)) { ret = -ENOMEM; + hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx, + HUGETLB_MAX); + } css_put(&h_cg->css); done: *ptr = h_cg; @@ -283,10 +322,45 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) +{ + int idx; + u64 val; + struct cftype *cft = seq_cft(seq); + unsigned long limit; + struct page_counter *counter; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + counter = &h_cg->hugepage[idx]; + + limit = round_down(PAGE_COUNTER_MAX, + 1 << huge_page_order(&hstates[idx])); + + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + val = (u64)page_counter_read(counter); + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + case RES_LIMIT: + val = (u64)counter->max; + if (val == limit) + seq_puts(seq, "max\n"); + else + seq_printf(seq, "%llu\n", val * PAGE_SIZE); + break; + default: + BUG(); + } + + return 0; +} + static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) + char *buf, size_t nbytes, loff_t off, + const char *max) { int ret, idx; unsigned long nr_pages; @@ -296,7 +370,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, "-1", &nr_pages); + ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; @@ -316,6 +390,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret ?: nbytes; } +static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); +} + +static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); +} + static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { @@ -350,7 +436,36 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) return buf; } -static void __init __hugetlb_cgroup_file_init(int idx) +static int __hugetlb_events_show(struct seq_file *seq, bool local) +{ + int idx; + long max; + struct cftype *cft = seq_cft(seq); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + + idx = MEMFILE_IDX(cft->private); + + if (local) + max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); + else + max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); + + seq_printf(seq, "max %lu\n", max); + + return 0; +} + +static int hugetlb_events_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, false); +} + +static int hugetlb_events_local_show(struct seq_file *seq, void *v) +{ + return __hugetlb_events_show(seq, true); +} + +static void __init __hugetlb_cgroup_file_dfl_init(int idx) { char buf[32]; struct cftype *cft; @@ -360,38 +475,93 @@ static void __init __hugetlb_cgroup_file_init(int idx) mem_fmt(buf, 32, huge_page_size(h)); /* Add the limit file */ - cft = &h->cgroup_files[0]; + cft = &h->cgroup_files_dfl[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->write = hugetlb_cgroup_write_dfl; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the current usage file */ + cft = &h->cgroup_files_dfl[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->seq_show = hugetlb_cgroup_read_u64_max; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events file */ + cft = &h->cgroup_files_dfl[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]), + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* Add the events.local file */ + cft = &h->cgroup_files_dfl[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_events_local_show; + cft->file_offset = offsetof(struct hugetlb_cgroup, + events_local_file[idx]), + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[4]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, + h->cgroup_files_dfl)); +} + +static void __init __hugetlb_cgroup_file_legacy_init(int idx) +{ + char buf[32]; + struct cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files_legacy[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; - cft->write = hugetlb_cgroup_write; + cft->write = hugetlb_cgroup_write_legacy; /* Add the usage file */ - cft = &h->cgroup_files[1]; + cft = &h->cgroup_files_legacy[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ - cft = &h->cgroup_files[2]; + cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ - cft = &h->cgroup_files[3]; + cft = &h->cgroup_files_legacy[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ - cft = &h->cgroup_files[4]; + cft = &h->cgroup_files_legacy[4]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, - h->cgroup_files)); + h->cgroup_files_legacy)); +} + +static void __init __hugetlb_cgroup_file_init(int idx) +{ + __hugetlb_cgroup_file_dfl_init(idx); + __hugetlb_cgroup_file_legacy_init(idx); } void __init hugetlb_cgroup_file_init(void) @@ -433,8 +603,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } +static struct cftype hugetlb_files[] = { + {} /* terminate */ +}; + struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, + .dfl_cftypes = hugetlb_files, + .legacy_cftypes = hugetlb_files, }; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 621782100eaa..5ef9f24f566b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -512,3 +512,43 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon end_report(&flags); } + +#ifdef CONFIG_KASAN_INLINE +/* + * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high + * canonical half of the address space) cause out-of-bounds shadow memory reads + * before the actual access. For addresses in the low canonical half of the + * address space, as well as most non-canonical addresses, that out-of-bounds + * shadow memory access lands in the non-canonical part of the address space. + * Help the user figure out what the original bogus pointer was. + */ +void kasan_non_canonical_hook(unsigned long addr) +{ + unsigned long orig_addr; + const char *bug_type; + + if (addr < KASAN_SHADOW_OFFSET) + return; + + orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT; + /* + * For faults near the shadow address for NULL, we can be fairly certain + * that this is a KASAN shadow memory access. + * For faults that correspond to shadow for low canonical addresses, we + * can still be pretty sure - that shadow region is a fairly narrow + * chunk of the non-canonical address space. + * But faults that look like shadow for non-canonical addresses are a + * really large chunk of the address space. In that case, we still + * print the decoded address, but make it clear that this is not + * necessarily what's actually going on. + */ + if (orig_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if (orig_addr < TASK_SIZE) + bug_type = "probably user-memory-access"; + else + bug_type = "maybe wild-memory-access"; + pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type, + orig_addr, orig_addr + KASAN_SHADOW_MASK); +} +#endif diff --git a/mm/madvise.c b/mm/madvise.c index bcdb6a042787..43b47d3fae02 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior) * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +int do_madvise(unsigned long start, size_t len_in, int behavior) { unsigned long end, tmp; struct vm_area_struct *vma, *prev; @@ -1141,3 +1141,8 @@ out: return error; } + +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +{ + return do_madvise(start, len_in, behavior); +} diff --git a/mm/memory.c b/mm/memory.c index 45442d9a4f52..1c4be871a237 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2203,7 +2203,7 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spinlock_t *ptl = pte_lockptr(mm, pmd); spin_lock(ptl); diff --git a/mm/mmap.c b/mm/mmap.c index 71e4ffc83bcd..bc788548c4e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3336,6 +3336,8 @@ static const struct vm_operations_struct special_mapping_vmops = { .fault = special_mapping_fault, .mremap = special_mapping_mremap, .name = special_mapping_name, + /* vDSO code relies that VVAR can't be accessed remotely */ + .access = NULL, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f76ea05b1cb0..ef3973a5d34a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -29,12 +29,12 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = { #endif /* - * The mmu notifier_mm structure is allocated and installed in - * mm->mmu_notifier_mm inside the mm_take_all_locks() protected + * The mmu_notifier_subscriptions structure is allocated and installed in + * mm->notifier_subscriptions inside the mm_take_all_locks() protected * critical section and it's released only when mm_count reaches zero * in mmdrop(). */ -struct mmu_notifier_mm { +struct mmu_notifier_subscriptions { /* all mmu notifiers registered in this mm are queued in this list */ struct hlist_head list; bool has_itree; @@ -65,80 +65,81 @@ struct mmu_notifier_mm { * * The write side has two states, fully excluded: * - mm->active_invalidate_ranges != 0 - * - mnn->invalidate_seq & 1 == True (odd) + * - subscriptions->invalidate_seq & 1 == True (odd) * - some range on the mm_struct is being invalidated * - the itree is not allowed to change * * And partially excluded: * - mm->active_invalidate_ranges != 0 - * - mnn->invalidate_seq & 1 == False (even) + * - subscriptions->invalidate_seq & 1 == False (even) * - some range on the mm_struct is being invalidated * - the itree is allowed to change * - * Operations on mmu_notifier_mm->invalidate_seq (under spinlock): + * Operations on notifier_subscriptions->invalidate_seq (under spinlock): * seq |= 1 # Begin writing * seq++ # Release the writing state * seq & 1 # True if a writer exists * * The later state avoids some expensive work on inv_end in the common case of - * no mni monitoring the VA. + * no mmu_interval_notifier monitoring the VA. */ -static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm) +static bool +mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions) { - lockdep_assert_held(&mmn_mm->lock); - return mmn_mm->invalidate_seq & 1; + lockdep_assert_held(&subscriptions->lock); + return subscriptions->invalidate_seq & 1; } static struct mmu_interval_notifier * -mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm, +mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range, unsigned long *seq) { struct interval_tree_node *node; struct mmu_interval_notifier *res = NULL; - spin_lock(&mmn_mm->lock); - mmn_mm->active_invalidate_ranges++; - node = interval_tree_iter_first(&mmn_mm->itree, range->start, + spin_lock(&subscriptions->lock); + subscriptions->active_invalidate_ranges++; + node = interval_tree_iter_first(&subscriptions->itree, range->start, range->end - 1); if (node) { - mmn_mm->invalidate_seq |= 1; + subscriptions->invalidate_seq |= 1; res = container_of(node, struct mmu_interval_notifier, interval_tree); } - *seq = mmn_mm->invalidate_seq; - spin_unlock(&mmn_mm->lock); + *seq = subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); return res; } static struct mmu_interval_notifier * -mn_itree_inv_next(struct mmu_interval_notifier *mni, +mn_itree_inv_next(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range) { struct interval_tree_node *node; - node = interval_tree_iter_next(&mni->interval_tree, range->start, - range->end - 1); + node = interval_tree_iter_next(&interval_sub->interval_tree, + range->start, range->end - 1); if (!node) return NULL; return container_of(node, struct mmu_interval_notifier, interval_tree); } -static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) +static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions) { - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; struct hlist_node *next; - spin_lock(&mmn_mm->lock); - if (--mmn_mm->active_invalidate_ranges || - !mn_itree_is_invalidating(mmn_mm)) { - spin_unlock(&mmn_mm->lock); + spin_lock(&subscriptions->lock); + if (--subscriptions->active_invalidate_ranges || + !mn_itree_is_invalidating(subscriptions)) { + spin_unlock(&subscriptions->lock); return; } /* Make invalidate_seq even */ - mmn_mm->invalidate_seq++; + subscriptions->invalidate_seq++; /* * The inv_end incorporates a deferred mechanism like rtnl_unlock(). @@ -146,30 +147,31 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) * they are progressed. This arrangement for tree updates is used to * avoid using a blocking lock during invalidate_range_start. */ - hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list, + hlist_for_each_entry_safe(interval_sub, next, + &subscriptions->deferred_list, deferred_item) { - if (RB_EMPTY_NODE(&mni->interval_tree.rb)) - interval_tree_insert(&mni->interval_tree, - &mmn_mm->itree); + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); else - interval_tree_remove(&mni->interval_tree, - &mmn_mm->itree); - hlist_del(&mni->deferred_item); + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); + hlist_del(&interval_sub->deferred_item); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); - wake_up_all(&mmn_mm->wq); + wake_up_all(&subscriptions->wq); } /** * mmu_interval_read_begin - Begin a read side critical section against a VA * range - * mni: The range to use + * interval_sub: The interval subscription * * mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a - * collision-retry scheme similar to seqcount for the VA range under mni. If - * the mm invokes invalidation during the critical section then - * mmu_interval_read_retry() will return true. + * collision-retry scheme similar to seqcount for the VA range under + * subscription. If the mm invokes invalidation during the critical section + * then mmu_interval_read_retry() will return true. * * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs * require a blocking context. The critical region formed by this can sleep, @@ -180,68 +182,71 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm) * * The return value should be passed to mmu_interval_read_retry(). */ -unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) +unsigned long +mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) { - struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + interval_sub->mm->notifier_subscriptions; unsigned long seq; bool is_invalidating; /* - * If the mni has a different seq value under the user_lock than we - * started with then it has collided. + * If the subscription has a different seq value under the user_lock + * than we started with then it has collided. * - * If the mni currently has the same seq value as the mmn_mm seq, then - * it is currently between invalidate_start/end and is colliding. + * If the subscription currently has the same seq value as the + * subscriptions seq, then it is currently between + * invalidate_start/end and is colliding. * * The locking looks broadly like this: * mn_tree_invalidate_start(): mmu_interval_read_begin(): * spin_lock - * seq = READ_ONCE(mni->invalidate_seq); - * seq == mmn_mm->invalidate_seq + * seq = READ_ONCE(interval_sub->invalidate_seq); + * seq == subs->invalidate_seq * spin_unlock * spin_lock - * seq = ++mmn_mm->invalidate_seq + * seq = ++subscriptions->invalidate_seq * spin_unlock * op->invalidate_range(): * user_lock * mmu_interval_set_seq() - * mni->invalidate_seq = seq + * interval_sub->invalidate_seq = seq * user_unlock * * [Required: mmu_interval_read_retry() == true] * * mn_itree_inv_end(): * spin_lock - * seq = ++mmn_mm->invalidate_seq + * seq = ++subscriptions->invalidate_seq * spin_unlock * * user_lock * mmu_interval_read_retry(): - * mni->invalidate_seq != seq + * interval_sub->invalidate_seq != seq * user_unlock * * Barriers are not needed here as any races here are closed by an * eventual mmu_interval_read_retry(), which provides a barrier via the * user_lock. */ - spin_lock(&mmn_mm->lock); + spin_lock(&subscriptions->lock); /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ - seq = READ_ONCE(mni->invalidate_seq); - is_invalidating = seq == mmn_mm->invalidate_seq; - spin_unlock(&mmn_mm->lock); + seq = READ_ONCE(interval_sub->invalidate_seq); + is_invalidating = seq == subscriptions->invalidate_seq; + spin_unlock(&subscriptions->lock); /* - * mni->invalidate_seq must always be set to an odd value via + * interval_sub->invalidate_seq must always be set to an odd value via * mmu_interval_set_seq() using the provided cur_seq from * mn_itree_inv_start_range(). This ensures that if seq does wrap we * will always clear the below sleep in some reasonable time as - * mmn_mm->invalidate_seq is even in the idle state. + * subscriptions->invalidate_seq is even in the idle state. */ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (is_invalidating) - wait_event(mmn_mm->wq, - READ_ONCE(mmn_mm->invalidate_seq) != seq); + wait_event(subscriptions->wq, + READ_ONCE(subscriptions->invalidate_seq) != seq); /* * Notice that mmu_interval_read_retry() can already be true at this @@ -253,7 +258,7 @@ unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni) } EXPORT_SYMBOL_GPL(mmu_interval_read_begin); -static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, +static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { struct mmu_notifier_range range = { @@ -263,17 +268,20 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, .start = 0, .end = ULONG_MAX, }; - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; unsigned long cur_seq; bool ret; - for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni; - mni = mn_itree_inv_next(mni, &range)) { - ret = mni->ops->invalidate(mni, &range, cur_seq); + for (interval_sub = + mn_itree_inv_start_range(subscriptions, &range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, &range)) { + ret = interval_sub->ops->invalidate(interval_sub, &range, + cur_seq); WARN_ON(!ret); } - mn_itree_inv_end(mmn_mm); + mn_itree_inv_end(subscriptions); } /* @@ -283,15 +291,15 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm, * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to SRCU and it serializes - * against the other mmu notifiers with SRCU. struct mmu_notifier_mm + * the notifier_subscriptions->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ -static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, +static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions, struct mm_struct *mm) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; /* @@ -299,29 +307,29 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, * ->release returns. */ id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) /* * If ->release runs before mmu_notifier_unregister it must be * handled, as it's the only way for the driver to flush all * existing sptes and stop the driver from establishing any more * sptes before all the pages in the mm are freed. */ - if (mn->ops->release) - mn->ops->release(mn, mm); + if (subscription->ops->release) + subscription->ops->release(subscription, mm); - spin_lock(&mmn_mm->lock); - while (unlikely(!hlist_empty(&mmn_mm->list))) { - mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier, - hlist); + spin_lock(&subscriptions->lock); + while (unlikely(!hlist_empty(&subscriptions->list))) { + subscription = hlist_entry(subscriptions->list.first, + struct mmu_notifier, hlist); /* * We arrived before mmu_notifier_unregister so * mmu_notifier_unregister will do nothing other than to wait * for ->release to finish and for mmu_notifier_unregister to * return. */ - hlist_del_init_rcu(&mn->hlist); + hlist_del_init_rcu(&subscription->hlist); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); srcu_read_unlock(&srcu, id); /* @@ -330,21 +338,22 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm, * until the ->release method returns, if it was invoked by * mmu_notifier_unregister. * - * The mmu_notifier_mm can't go away from under us because one mm_count - * is held by exit_mmap. + * The notifier_subscriptions can't go away from under us because + * one mm_count is held by exit_mmap. */ synchronize_srcu(&srcu); } void __mmu_notifier_release(struct mm_struct *mm) { - struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; - if (mmn_mm->has_itree) - mn_itree_release(mmn_mm, mm); + if (subscriptions->has_itree) + mn_itree_release(subscriptions, mm); - if (!hlist_empty(&mmn_mm->list)) - mn_hlist_release(mmn_mm, mm); + if (!hlist_empty(&subscriptions->list)) + mn_hlist_release(subscriptions, mm); } /* @@ -356,13 +365,15 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->clear_flush_young) - young |= mn->ops->clear_flush_young(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->clear_flush_young) + young |= subscription->ops->clear_flush_young( + subscription, mm, start, end); } srcu_read_unlock(&srcu, id); @@ -373,13 +384,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->clear_young) - young |= mn->ops->clear_young(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->clear_young) + young |= subscription->ops->clear_young(subscription, + mm, start, end); } srcu_read_unlock(&srcu, id); @@ -389,13 +402,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm, int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int young = 0, id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->test_young) { - young = mn->ops->test_young(mn, mm, address); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->test_young) { + young = subscription->ops->test_young(subscription, mm, + address); if (young) break; } @@ -408,28 +423,33 @@ int __mmu_notifier_test_young(struct mm_struct *mm, void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->change_pte) - mn->ops->change_pte(mn, mm, address, pte); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->change_pte) + subscription->ops->change_pte(subscription, mm, address, + pte); } srcu_read_unlock(&srcu, id); } -static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm, +static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions, const struct mmu_notifier_range *range) { - struct mmu_interval_notifier *mni; + struct mmu_interval_notifier *interval_sub; unsigned long cur_seq; - for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni; - mni = mn_itree_inv_next(mni, range)) { + for (interval_sub = + mn_itree_inv_start_range(subscriptions, range, &cur_seq); + interval_sub; + interval_sub = mn_itree_inv_next(interval_sub, range)) { bool ret; - ret = mni->ops->invalidate(mni, range, cur_seq); + ret = interval_sub->ops->invalidate(interval_sub, range, + cur_seq); if (!ret) { if (WARN_ON(mmu_notifier_range_blockable(range))) continue; @@ -443,31 +463,36 @@ out_would_block: * On -EAGAIN the non-blocking caller is not allowed to call * invalidate_range_end() */ - mn_itree_inv_end(mmn_mm); + mn_itree_inv_end(subscriptions); return -EAGAIN; } -static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, - struct mmu_notifier_range *range) +static int mn_hlist_invalidate_range_start( + struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int ret = 0; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { - if (mn->ops->invalidate_range_start) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { + const struct mmu_notifier_ops *ops = subscription->ops; + + if (ops->invalidate_range_start) { int _ret; if (!mmu_notifier_range_blockable(range)) non_block_start(); - _ret = mn->ops->invalidate_range_start(mn, range); + _ret = ops->invalidate_range_start(subscription, range); if (!mmu_notifier_range_blockable(range)) non_block_end(); if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", - mn->ops->invalidate_range_start, _ret, - !mmu_notifier_range_blockable(range) ? "non-" : ""); + ops->invalidate_range_start, _ret, + !mmu_notifier_range_blockable(range) ? + "non-" : + ""); WARN_ON(mmu_notifier_range_blockable(range) || _ret != -EAGAIN); ret = _ret; @@ -481,28 +506,29 @@ static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm, int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { - struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; int ret; - if (mmn_mm->has_itree) { - ret = mn_itree_invalidate(mmn_mm, range); + if (subscriptions->has_itree) { + ret = mn_itree_invalidate(subscriptions, range); if (ret) return ret; } - if (!hlist_empty(&mmn_mm->list)) - return mn_hlist_invalidate_range_start(mmn_mm, range); + if (!hlist_empty(&subscriptions->list)) + return mn_hlist_invalidate_range_start(subscriptions, range); return 0; } -static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, - struct mmu_notifier_range *range, - bool only_end) +static void +mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, + struct mmu_notifier_range *range, bool only_end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) { + hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) { /* * Call invalidate_range here too to avoid the need for the * subsystem of having to register an invalidate_range_end @@ -516,14 +542,16 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, * is safe to do when we know that a call to invalidate_range() * already happen under page table lock. */ - if (!only_end && mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, range->mm, - range->start, - range->end); - if (mn->ops->invalidate_range_end) { + if (!only_end && subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, + range->mm, + range->start, + range->end); + if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) non_block_start(); - mn->ops->invalidate_range_end(mn, range); + subscription->ops->invalidate_range_end(subscription, + range); if (!mmu_notifier_range_blockable(range)) non_block_end(); } @@ -534,27 +562,30 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm, void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, bool only_end) { - struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm; + struct mmu_notifier_subscriptions *subscriptions = + range->mm->notifier_subscriptions; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); - if (mmn_mm->has_itree) - mn_itree_inv_end(mmn_mm); + if (subscriptions->has_itree) + mn_itree_inv_end(subscriptions); - if (!hlist_empty(&mmn_mm->list)) - mn_hlist_invalidate_end(mmn_mm, range, only_end); + if (!hlist_empty(&subscriptions->list)) + mn_hlist_invalidate_end(subscriptions, range, only_end); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int id; id = srcu_read_lock(&srcu); - hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range) - mn->ops->invalidate_range(mn, mm, start, end); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops->invalidate_range) + subscription->ops->invalidate_range(subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -564,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, * write mode. A NULL mn signals the notifier is being registered for itree * mode. */ -int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +int __mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) { - struct mmu_notifier_mm *mmu_notifier_mm = NULL; + struct mmu_notifier_subscriptions *subscriptions = NULL; int ret; lockdep_assert_held_write(&mm->mmap_sem); @@ -579,23 +611,23 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) fs_reclaim_release(GFP_KERNEL); } - if (!mm->mmu_notifier_mm) { + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we - * know that mm->mmu_notifier_mm can't change while we hold - * the write side of the mmap_sem. + * know that mm->notifier_subscriptions can't change while we + * hold the write side of the mmap_sem. */ - mmu_notifier_mm = - kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (!mmu_notifier_mm) + subscriptions = kzalloc( + sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL); + if (!subscriptions) return -ENOMEM; - INIT_HLIST_HEAD(&mmu_notifier_mm->list); - spin_lock_init(&mmu_notifier_mm->lock); - mmu_notifier_mm->invalidate_seq = 2; - mmu_notifier_mm->itree = RB_ROOT_CACHED; - init_waitqueue_head(&mmu_notifier_mm->wq); - INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list); + INIT_HLIST_HEAD(&subscriptions->list); + spin_lock_init(&subscriptions->lock); + subscriptions->invalidate_seq = 2; + subscriptions->itree = RB_ROOT_CACHED; + init_waitqueue_head(&subscriptions->wq); + INIT_HLIST_HEAD(&subscriptions->deferred_list); } ret = mm_take_all_locks(mm); @@ -610,34 +642,36 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). * - * release semantics on the initialization of the mmu_notifier_mm's - * contents are provided for unlocked readers. acquire can only be - * used while holding the mmgrab or mmget, and is safe because once - * created the mmu_notififer_mm is not freed until the mm is - * destroyed. As above, users holding the mmap_sem or one of the + * release semantics on the initialization of the + * mmu_notifier_subscriptions's contents are provided for unlocked + * readers. acquire can only be used while holding the mmgrab or + * mmget, and is safe because once created the + * mmu_notifier_subscriptions is not freed until the mm is destroyed. + * As above, users holding the mmap_sem or one of the * mm_take_all_locks() do not need to use acquire semantics. */ - if (mmu_notifier_mm) - smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm); + if (subscriptions) + smp_store_release(&mm->notifier_subscriptions, subscriptions); - if (mn) { + if (subscription) { /* Pairs with the mmdrop in mmu_notifier_unregister_* */ mmgrab(mm); - mn->mm = mm; - mn->users = 1; + subscription->mm = mm; + subscription->users = 1; - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_lock(&mm->notifier_subscriptions->lock); + hlist_add_head_rcu(&subscription->hlist, + &mm->notifier_subscriptions->list); + spin_unlock(&mm->notifier_subscriptions->lock); } else - mm->mmu_notifier_mm->has_itree = true; + mm->notifier_subscriptions->has_itree = true; mm_drop_all_locks(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); return 0; out_clean: - kfree(mmu_notifier_mm); + kfree(subscriptions); return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_register); @@ -658,15 +692,16 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register); * mmu_notifier_unregister() or mmu_notifier_put() must be always called to * unregister the notifier. * - * While the caller has a mmu_notifier get the mn->mm pointer will remain + * While the caller has a mmu_notifier get the subscription->mm pointer will remain * valid, and can be converted to an active mm pointer via mmget_not_zero(). */ -int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +int mmu_notifier_register(struct mmu_notifier *subscription, + struct mm_struct *mm) { int ret; down_write(&mm->mmap_sem); - ret = __mmu_notifier_register(mn, mm); + ret = __mmu_notifier_register(subscription, mm); up_write(&mm->mmap_sem); return ret; } @@ -675,21 +710,22 @@ EXPORT_SYMBOL_GPL(mmu_notifier_register); static struct mmu_notifier * find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops != ops) + spin_lock(&mm->notifier_subscriptions->lock); + hlist_for_each_entry_rcu(subscription, + &mm->notifier_subscriptions->list, hlist) { + if (subscription->ops != ops) continue; - if (likely(mn->users != UINT_MAX)) - mn->users++; + if (likely(subscription->users != UINT_MAX)) + subscription->users++; else - mn = ERR_PTR(-EOVERFLOW); - spin_unlock(&mm->mmu_notifier_mm->lock); - return mn; + subscription = ERR_PTR(-EOVERFLOW); + spin_unlock(&mm->notifier_subscriptions->lock); + return subscription; } - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mm->notifier_subscriptions->lock); return NULL; } @@ -713,37 +749,37 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { - struct mmu_notifier *mn; + struct mmu_notifier *subscription; int ret; lockdep_assert_held_write(&mm->mmap_sem); - if (mm->mmu_notifier_mm) { - mn = find_get_mmu_notifier(mm, ops); - if (mn) - return mn; + if (mm->notifier_subscriptions) { + subscription = find_get_mmu_notifier(mm, ops); + if (subscription) + return subscription; } - mn = ops->alloc_notifier(mm); - if (IS_ERR(mn)) - return mn; - mn->ops = ops; - ret = __mmu_notifier_register(mn, mm); + subscription = ops->alloc_notifier(mm); + if (IS_ERR(subscription)) + return subscription; + subscription->ops = ops; + ret = __mmu_notifier_register(subscription, mm); if (ret) goto out_free; - return mn; + return subscription; out_free: - mn->ops->free_notifier(mn); + subscription->ops->free_notifier(subscription); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); /* this is called after the last mmu_notifier_unregister() returned */ -void __mmu_notifier_mm_destroy(struct mm_struct *mm) +void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { - BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); - kfree(mm->mmu_notifier_mm); - mm->mmu_notifier_mm = LIST_POISON1; /* debug */ + BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list)); + kfree(mm->notifier_subscriptions); + mm->notifier_subscriptions = LIST_POISON1; /* debug */ } /* @@ -756,11 +792,12 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) * and only after mmu_notifier_unregister returned we're guaranteed * that ->release or any other method can't run anymore. */ -void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +void mmu_notifier_unregister(struct mmu_notifier *subscription, + struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - if (!hlist_unhashed(&mn->hlist)) { + if (!hlist_unhashed(&subscription->hlist)) { /* * SRCU here will force exit_mmap to wait for ->release to * finish before freeing the pages. @@ -772,17 +809,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) * exit_mmap will block in mmu_notifier_release to guarantee * that ->release is called before freeing the pages. */ - if (mn->ops->release) - mn->ops->release(mn, mm); + if (subscription->ops->release) + subscription->ops->release(subscription, mm); srcu_read_unlock(&srcu, id); - spin_lock(&mm->mmu_notifier_mm->lock); + spin_lock(&mm->notifier_subscriptions->lock); /* * Can not use list_del_rcu() since __mmu_notifier_release * can delete it before we hold the lock. */ - hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); } /* @@ -799,10 +836,11 @@ EXPORT_SYMBOL_GPL(mmu_notifier_unregister); static void mmu_notifier_free_rcu(struct rcu_head *rcu) { - struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); - struct mm_struct *mm = mn->mm; + struct mmu_notifier *subscription = + container_of(rcu, struct mmu_notifier, rcu); + struct mm_struct *mm = subscription->mm; - mn->ops->free_notifier(mn); + subscription->ops->free_notifier(subscription); /* Pairs with the get in __mmu_notifier_register() */ mmdrop(mm); } @@ -829,39 +867,40 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu) * Modules calling this function must call mmu_notifier_synchronize() in * their __exit functions to ensure the async work is completed. */ -void mmu_notifier_put(struct mmu_notifier *mn) +void mmu_notifier_put(struct mmu_notifier *subscription) { - struct mm_struct *mm = mn->mm; + struct mm_struct *mm = subscription->mm; - spin_lock(&mm->mmu_notifier_mm->lock); - if (WARN_ON(!mn->users) || --mn->users) + spin_lock(&mm->notifier_subscriptions->lock); + if (WARN_ON(!subscription->users) || --subscription->users) goto out_unlock; - hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); + hlist_del_init_rcu(&subscription->hlist); + spin_unlock(&mm->notifier_subscriptions->lock); - call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu); + call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu); return; out_unlock: - spin_unlock(&mm->mmu_notifier_mm->lock); + spin_unlock(&mm->notifier_subscriptions->lock); } EXPORT_SYMBOL_GPL(mmu_notifier_put); static int __mmu_interval_notifier_insert( - struct mmu_interval_notifier *mni, struct mm_struct *mm, - struct mmu_notifier_mm *mmn_mm, unsigned long start, + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, + struct mmu_notifier_subscriptions *subscriptions, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - mni->mm = mm; - mni->ops = ops; - RB_CLEAR_NODE(&mni->interval_tree.rb); - mni->interval_tree.start = start; + interval_sub->mm = mm; + interval_sub->ops = ops; + RB_CLEAR_NODE(&interval_sub->interval_tree.rb); + interval_sub->interval_tree.start = start; /* * Note that the representation of the intervals in the interval tree * considers the ending point as contained in the interval. */ if (length == 0 || - check_add_overflow(start, length - 1, &mni->interval_tree.last)) + check_add_overflow(start, length - 1, + &interval_sub->interval_tree.last)) return -EOVERFLOW; /* Must call with a mmget() held */ @@ -881,38 +920,40 @@ static int __mmu_interval_notifier_insert( * possibility for live lock, instead defer the add to * mn_itree_inv_end() so this algorithm is deterministic. * - * In all cases the value for the mni->invalidate_seq should be + * In all cases the value for the interval_sub->invalidate_seq should be * odd, see mmu_interval_read_begin() */ - spin_lock(&mmn_mm->lock); - if (mmn_mm->active_invalidate_ranges) { - if (mn_itree_is_invalidating(mmn_mm)) - hlist_add_head(&mni->deferred_item, - &mmn_mm->deferred_list); + spin_lock(&subscriptions->lock); + if (subscriptions->active_invalidate_ranges) { + if (mn_itree_is_invalidating(subscriptions)) + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); else { - mmn_mm->invalidate_seq |= 1; - interval_tree_insert(&mni->interval_tree, - &mmn_mm->itree); + subscriptions->invalidate_seq |= 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); } - mni->invalidate_seq = mmn_mm->invalidate_seq; + interval_sub->invalidate_seq = subscriptions->invalidate_seq; } else { - WARN_ON(mn_itree_is_invalidating(mmn_mm)); + WARN_ON(mn_itree_is_invalidating(subscriptions)); /* - * The starting seq for a mni not under invalidation should be - * odd, not equal to the current invalidate_seq and + * The starting seq for a subscription not under invalidation + * should be odd, not equal to the current invalidate_seq and * invalidate_seq should not 'wrap' to the new seq any time * soon. */ - mni->invalidate_seq = mmn_mm->invalidate_seq - 1; - interval_tree_insert(&mni->interval_tree, &mmn_mm->itree); + interval_sub->invalidate_seq = + subscriptions->invalidate_seq - 1; + interval_tree_insert(&interval_sub->interval_tree, + &subscriptions->itree); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); return 0; } /** * mmu_interval_notifier_insert - Insert an interval notifier - * @mni: Interval notifier to register + * @interval_sub: Interval subscription to register * @start: Starting virtual address to monitor * @length: Length of the range to monitor * @mm : mm_struct to attach to @@ -925,53 +966,53 @@ static int __mmu_interval_notifier_insert( * The caller must use the normal interval notifier read flow via * mmu_interval_read_begin() to establish SPTEs for this range. */ -int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, +int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - struct mmu_notifier_mm *mmn_mm; + struct mmu_notifier_subscriptions *subscriptions; int ret; might_lock(&mm->mmap_sem); - mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm); - if (!mmn_mm || !mmn_mm->has_itree) { + subscriptions = smp_load_acquire(&mm->notifier_subscriptions); + if (!subscriptions || !subscriptions->has_itree) { ret = mmu_notifier_register(NULL, mm); if (ret) return ret; - mmn_mm = mm->mmu_notifier_mm; + subscriptions = mm->notifier_subscriptions; } - return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, - ops); + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert); int mmu_interval_notifier_insert_locked( - struct mmu_interval_notifier *mni, struct mm_struct *mm, + struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops) { - struct mmu_notifier_mm *mmn_mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; int ret; lockdep_assert_held_write(&mm->mmap_sem); - mmn_mm = mm->mmu_notifier_mm; - if (!mmn_mm || !mmn_mm->has_itree) { + if (!subscriptions || !subscriptions->has_itree) { ret = __mmu_notifier_register(NULL, mm); if (ret) return ret; - mmn_mm = mm->mmu_notifier_mm; + subscriptions = mm->notifier_subscriptions; } - return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length, - ops); + return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions, + start, length, ops); } EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); /** * mmu_interval_notifier_remove - Remove a interval notifier - * @mni: Interval notifier to unregister + * @interval_sub: Interval subscription to unregister * * This function must be paired with mmu_interval_notifier_insert(). It cannot * be called from any ops callback. @@ -979,32 +1020,34 @@ EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked); * Once this returns ops callbacks are no longer running on other CPUs and * will not be called in future. */ -void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) +void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub) { - struct mm_struct *mm = mni->mm; - struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm; + struct mm_struct *mm = interval_sub->mm; + struct mmu_notifier_subscriptions *subscriptions = + mm->notifier_subscriptions; unsigned long seq = 0; might_sleep(); - spin_lock(&mmn_mm->lock); - if (mn_itree_is_invalidating(mmn_mm)) { + spin_lock(&subscriptions->lock); + if (mn_itree_is_invalidating(subscriptions)) { /* * remove is being called after insert put this on the * deferred list, but before the deferred list was processed. */ - if (RB_EMPTY_NODE(&mni->interval_tree.rb)) { - hlist_del(&mni->deferred_item); + if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) { + hlist_del(&interval_sub->deferred_item); } else { - hlist_add_head(&mni->deferred_item, - &mmn_mm->deferred_list); - seq = mmn_mm->invalidate_seq; + hlist_add_head(&interval_sub->deferred_item, + &subscriptions->deferred_list); + seq = subscriptions->invalidate_seq; } } else { - WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb)); - interval_tree_remove(&mni->interval_tree, &mmn_mm->itree); + WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb)); + interval_tree_remove(&interval_sub->interval_tree, + &subscriptions->itree); } - spin_unlock(&mmn_mm->lock); + spin_unlock(&subscriptions->lock); /* * The possible sleep on progress in the invalidation requires the @@ -1013,8 +1056,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni) lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); lock_map_release(&__mmu_notifier_invalidate_range_start_map); if (seq) - wait_event(mmn_mm->wq, - READ_ONCE(mmn_mm->invalidate_seq) != seq); + wait_event(subscriptions->wq, + READ_ONCE(subscriptions->invalidate_seq) != seq); /* pairs with mmgrab in mmu_interval_notifier_insert() */ mmdrop(mm); diff --git a/mm/percpu.c b/mm/percpu.c index 7e06a1e58720..e9844086b236 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, pcpu_unit_page_offset(cpu, page_idx); } -static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_zero_bit(bitmap, end, *rs); - *re = find_next_bit(bitmap, end, *rs + 1); -} - -static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end) -{ - *rs = find_next_bit(bitmap, end, *rs); - *re = find_next_zero_bit(bitmap, end, *rs + 1); -} - -/* - * Bitmap region iterators. Iterates over the bitmap between - * [@start, @end) in @chunk. @rs and @re should be integer variables - * and will be set to start and end index of the current free region. - */ -#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end))) - -#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \ - for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \ - (rs) < (re); \ - (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end))) - /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. @@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) } bits = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); - } } /** @@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re, start; /* region start, region end */ + unsigned int rs, re, start; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { @@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, start, - PCPU_BITMAP_BLOCK_BITS) { + bitmap_for_each_clear_region(alloc_map, rs, re, start, + PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, rs, re); - } } /** @@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); rs = page_start; - pcpu_next_unpop(chunk->populated, &rs, &re, page_end); + bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); if (rs >= page_end) return true; @@ -1702,13 +1673,13 @@ area_found: /* populate if not all pages are already there */ if (!is_atomic) { - int page_start, page_end, rs, re; + unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); - pcpu_for_each_unpop_region(chunk->populated, rs, re, - page_start, page_end) { + bitmap_for_each_clear_region(chunk->populated, rs, re, + page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); @@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { - int rs, re; + unsigned int rs, re; - pcpu_for_each_pop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { + bitmap_for_each_set_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); @@ -1893,7 +1864,7 @@ retry_pop: } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { - int nr_unpop = 0, rs, re; + unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; @@ -1910,9 +1881,9 @@ retry_pop: continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ - pcpu_for_each_unpop_region(chunk->populated, rs, re, 0, - chunk->nr_pages) { - int nr = min(re - rs, nr_to_pop); + bitmap_for_each_clear_region(chunk->populated, rs, re, 0, + chunk->nr_pages) { + int nr = min_t(int, re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { diff --git a/mm/slub.c b/mm/slub.c index 8eafccf75940..0ab92ec8c2a6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1964,7 +1964,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, return get_any_partial(s, flags, c); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguiation * during cmpxchg. The transactions start with the cpu number and are then @@ -2009,7 +2009,7 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); @@ -2341,7 +2341,7 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); } /* @@ -2637,7 +2637,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long flags; local_irq_save(flags); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* * We may have been preempted and rescheduled on a different * cpu before disabling interrupts. Need to reload cpu area @@ -2691,13 +2691,13 @@ redo: * as we end up on the original cpu again when doing the cmpxchg. * * We should guarantee that tid and kmem_cache are retrieved on - * the same cpu. It could be different if CONFIG_PREEMPT so we need + * the same cpu. It could be different if CONFIG_PREEMPTION so we need * to check if it is matched or not. */ do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* @@ -2971,7 +2971,7 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && + } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b29ad17edcf5..1f46c3b86f9f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -41,6 +41,14 @@ #include "internal.h" +bool is_vmalloc_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + + return addr >= VMALLOC_START && addr < VMALLOC_END; +} +EXPORT_SYMBOL(is_vmalloc_addr); + struct vfree_deferred { struct llist_head list; struct work_struct wq; |