aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c792
1 files changed, 499 insertions, 293 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0bdfc7e1c933..0ad53ad98e74 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
+#include <linux/memory.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -257,7 +261,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
- struct file_region *nrg = NULL;
+ struct file_region *nrg;
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -339,7 +343,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
- struct file_region *nrg = NULL, *prg = NULL;
+ struct file_region *nrg, *prg;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -456,14 +460,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
- struct list_head allocated_regions;
+ LIST_HEAD(allocated_regions);
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
- INIT_LIST_HEAD(&allocated_regions);
-
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
@@ -860,7 +862,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
@@ -1007,12 +1009,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ /*
+ * Clear vm_private_data
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+ */
+ vma->vm_private_data = (void *)0;
if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ return;
}
/*
@@ -1043,7 +1053,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}
- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -1182,6 +1192,11 @@ retry_cpuset:
return NULL;
}
+static unsigned long available_huge_pages(struct hstate *h)
+{
+ return h->free_huge_pages - h->resv_huge_pages;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1198,12 +1213,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma, chg) &&
- h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ if (avoid_reserve && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1308,12 +1322,13 @@ static void __destroy_compound_gigantic_page(struct page *page,
{
int i;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 1; i < nr_pages; i++) {
+ p = nth_page(page, i);
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
@@ -1506,6 +1521,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_page_private(page, 0);
+ /*
+ * We have to set HPageVmemmapOptimized again as above
+ * set_page_private(page, 0) cleared it.
+ */
SetHPageVmemmapOptimized(page);
/*
@@ -1530,7 +1549,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- struct page *subpage = page;
+ struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1561,8 +1580,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (unlikely(PageHWPoison(page)))
hugetlb_clear_page_hwpoison(page);
- for (i = 0; i < pages_per_huge_page(h);
- i++, subpage = mem_map_next(subpage, page, i)) {
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ subpage = nth_page(page, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
@@ -1769,13 +1788,14 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
{
int i, j;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __ClearPageReserved(page);
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 0; i < nr_pages; i++) {
+ p = nth_page(page, i);
+
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
@@ -1814,22 +1834,26 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
} else {
VM_BUG_ON_PAGE(page_count(p), p);
}
- set_compound_head(p, page);
+ if (i != 0)
+ set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
return true;
out_error:
- /* undo tail page modifications made above */
- p = page + 1;
- for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
- clear_compound_head(p);
+ /* undo page modifications made above */
+ for (j = 0; j < i; j++) {
+ p = nth_page(page, j);
+ if (j != 0)
+ clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
- for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ for (; j < nr_pages; j++) {
+ p = nth_page(page, j);
__ClearPageReserved(p);
+ }
set_compound_order(page, 0);
#ifdef CONFIG_64BIT
page[1].compound_nr = 0;
@@ -1918,6 +1942,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
+ bool retry = true;
/*
* By default we always try hard to allocate the page with
@@ -1933,7 +1958,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
+retry:
page = __alloc_pages(gfp_mask, order, nid, nmask);
+
+ /* Freeze head page */
+ if (page && !page_ref_freeze(page, 1)) {
+ __free_pages(page, order);
+ if (retry) { /* retry once */
+ retry = false;
+ goto retry;
+ }
+ /* WOW! twice in a row. */
+ pr_warn("HugeTLB head page unexpected inflated ref count\n");
+ page = NULL;
+ }
+
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
@@ -1961,6 +2000,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
@@ -2018,7 +2060,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
if (!page)
return 0;
- put_page(page); /* free it into the hugepage allocator */
+ free_huge_page(page); /* free it into the hugepage allocator */
return 1;
}
@@ -2087,7 +2129,7 @@ retry:
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!available_huge_pages(h))
goto out;
/*
@@ -2175,10 +2217,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask, bool zero_ref)
+ int nid, nodemask_t *nmask)
{
struct page *page = NULL;
- bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
@@ -2188,7 +2229,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
-retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2204,34 +2244,10 @@ retry:
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetHPageTemporary(page);
spin_unlock_irq(&hugetlb_lock);
- put_page(page);
+ free_huge_page(page);
return NULL;
}
- if (zero_ref) {
- /*
- * Caller requires a page with zero ref count.
- * We will drop ref count here. If someone else is holding
- * a ref, the page will be freed when they drop it. Abuse
- * temporary page flag to accomplish this.
- */
- SetHPageTemporary(page);
- if (!put_page_testzero(page)) {
- /*
- * Unexpected inflated ref count on freshly allocated
- * huge. Retry once.
- */
- pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
- spin_unlock_irq(&hugetlb_lock);
- if (retry)
- return NULL;
-
- retry = true;
- goto retry;
- }
- ClearHPageTemporary(page);
- }
-
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2253,6 +2269,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
if (!page)
return NULL;
+ /* fresh huge pages are frozen */
+ set_page_refcounted(page);
+
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
@@ -2280,14 +2299,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
if (!page)
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
@@ -2297,7 +2316,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ if (available_huge_pages(h)) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
@@ -2336,7 +2355,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
- struct list_head surplus_list;
+ LIST_HEAD(surplus_list);
struct page *page, *tmp;
int ret;
long i;
@@ -2351,14 +2370,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
}
allocated = 0;
- INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL, true);
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
@@ -2720,7 +2738,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
- bool alloc_retry = false;
struct page *new_page;
int ret = 0;
@@ -2731,30 +2748,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
-alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
- /*
- * If all goes well, this page will be directly added to the free
- * list in the pool. For this the ref count needs to be zero.
- * Attempt to drop now, and retry once if needed. It is VERY
- * unlikely there is another ref on the page.
- *
- * If someone else has a reference to the page, it will be freed
- * when they drop their ref. Abuse temporary page flag to accomplish
- * this. Retry once if there is an inflated ref count.
- */
- SetHPageTemporary(new_page);
- if (!put_page_testzero(new_page)) {
- if (alloc_retry)
- return -EBUSY;
-
- alloc_retry = true;
- goto alloc_retry;
- }
- ClearHPageTemporary(new_page);
-
__prep_new_huge_page(h, new_page);
retry:
@@ -2934,6 +2930,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
}
spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3038,7 +3035,7 @@ static void __init gather_bootmem_prealloc(void)
if (prep_compound_gigantic_page(page, huge_page_order(h))) {
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* add to the hugepage allocator */
+ free_huge_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
free_gigantic_page(page, huge_page_order(h));
@@ -3070,7 +3067,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
&node_states[N_MEMORY], NULL);
if (!page)
break;
- put_page(page); /* free it into the hugepage allocator */
+ free_huge_page(page); /* free it into the hugepage allocator */
}
cond_resched();
}
@@ -3461,9 +3458,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
else
prep_compound_page(subpage, target_hstate->order);
set_page_private(subpage, 0);
- set_page_refcounted(subpage);
prep_new_huge_page(target_hstate, subpage, nid);
- put_page(subpage);
+ free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -3474,7 +3470,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
* based on pool changes for the demoted page.
*/
h->max_huge_pages--;
- target_hstate->max_huge_pages += pages_per_huge_page(h);
+ target_hstate->max_huge_pages +=
+ pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
return rc;
}
@@ -3716,7 +3713,7 @@ static ssize_t demote_store(struct kobject *kobj,
unsigned long nr_available;
nodemask_t nodes_allowed, *n_mask;
struct hstate *h;
- int err = 0;
+ int err;
int nid;
err = kstrtoul(buf, 10, &nr_demote);
@@ -3767,8 +3764,7 @@ HSTATE_ATTR_WO(demote);
static ssize_t demote_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int nid;
- struct hstate *h = kobj_to_hstate(kobj, &nid);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3781,7 +3777,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
struct hstate *h, *demote_hstate;
unsigned long demote_size;
unsigned int demote_order;
- int nid;
demote_size = (unsigned long)memparse(buf, NULL);
@@ -3793,7 +3788,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
return -EINVAL;
/* demote order must be smaller than hstate order */
- h = kobj_to_hstate(kobj, &nid);
+ h = kobj_to_hstate(kobj, NULL);
if (demote_order >= h->order)
return -EINVAL;
@@ -3847,35 +3842,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
+ return retval;
}
if (h->demote_order) {
- if (sysfs_create_group(hstate_kobjs[hi],
- &hstate_demote_attr_group))
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
}
- return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
- struct hstate *h;
- int err;
-
- hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
- if (!hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
- hstate_kobjs, &hstate_attr_group);
- if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
- }
+ return 0;
}
#ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3931,7 +3917,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3941,10 +3927,15 @@ static void hugetlb_unregister_node(struct node *node)
for_each_hstate(h) {
int idx = hstate_index(h);
- if (nhs->hstate_kobjs[idx]) {
- kobject_put(nhs->hstate_kobjs[idx]);
- nhs->hstate_kobjs[idx] = NULL;
- }
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
}
kobject_put(nhs->hugepages_kobj);
@@ -3956,12 +3947,15 @@ static void hugetlb_unregister_node(struct node *node)
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
+ if (!hugetlb_sysfs_initialized)
+ return;
+
if (nhs->hugepages_kobj)
return; /* already allocated */
@@ -3992,18 +3986,8 @@ static void __init hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_MEMORY) {
- struct node *node = node_devices[nid];
- if (node->dev.id == nid)
- hugetlb_register_node(node);
- }
-
- /*
- * Let the node device driver know we're here so it can
- * [un]register hstate attributes on node hotplug.
- */
- register_hugetlbfs_with_node(hugetlb_register_node,
- hugetlb_unregister_node);
+ for_each_online_node(nid)
+ hugetlb_register_node(node_devices[nid]);
}
#else /* !CONFIG_NUMA */
@@ -4019,6 +4003,36 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ }
+
+#ifdef CONFIG_NUMA
+ hugetlb_sysfs_initialized = true;
+#endif
+ hugetlb_register_all_nodes();
+}
+
static int __init hugetlb_init(void)
{
int i;
@@ -4073,7 +4087,6 @@ static int __init hugetlb_init(void)
report_hugepages();
hugetlb_sysfs_init();
- hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
#ifdef CONFIG_SMP
@@ -4118,7 +4131,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
- huge_page_size(h)/1024);
+ huge_page_size(h)/SZ_1K);
parsed_hstate = h;
}
@@ -4133,11 +4146,11 @@ static void __init hugepages_clear_pages_in_node(void)
if (!hugetlb_max_hstate) {
default_hstate_max_huge_pages = 0;
memset(default_hugepages_in_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(default_hugepages_in_node));
} else {
parsed_hstate->max_huge_pages = 0;
memset(parsed_hstate->max_huge_pages_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(parsed_hstate->max_huge_pages_node));
}
}
@@ -4332,18 +4345,34 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol = get_task_policy(current);
+
+ /*
+ * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+ * (from policy_nodemask) specifically for hugetlb case
+ */
+ if (mpol->mode == MPOL_BIND &&
+ (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+ return &mpol->nodes;
+#endif
+ return NULL;
+}
+
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
- nodemask_t *mpol_allowed;
+ nodemask_t *mbind_nodemask;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
- mpol_allowed = policy_nodemask_current(gfp_mask);
-
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed || node_isset(node, *mpol_allowed))
+ if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
nr += array[node];
}
@@ -4583,16 +4612,28 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ /*
+ * vma_lock structure for sharable mappings is vma specific.
+ * Clear old pointer (if copied via vm_area_dup) and create new.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -4723,14 +4764,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma)
{
- pte_t *src_pte, *dst_pte, entry, dst_entry;
+ pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
int ret = 0;
@@ -4744,12 +4784,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
+ * For shared mappings the vma lock must be held before
+ * calling huge_pte_offset in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
*/
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(src_vma);
}
last_addr_mask = hugetlb_mask_last_page(h);
@@ -4768,15 +4808,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* If the pagetables are shared don't copy or take references.
- * dst_pte == src_pte is the common case of src/dest sharing.
*
+ * dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
- * another vma. If dst_pte !none, this implies sharing.
- * Check here before taking page table lock, and once again
- * after taking the lock below.
+ * another vma. So page_count of ptep page is checked instead
+ * to reliably determine whether pte is shared.
*/
- dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ if (page_count(virt_to_page(dst_pte)) > 1) {
addr |= last_addr_mask;
continue;
}
@@ -4785,13 +4823,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- dst_entry = huge_ptep_get(dst_pte);
again:
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ if (huge_pte_none(entry)) {
/*
- * Skip if src entry none. Also, skip in the
- * unlikely case dst entry !none as this implies
- * sharing with another vma.
+ * Skip if src entry none.
*/
;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4870,7 +4905,7 @@ again:
restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
- /* dst_entry won't change as in child */
+ /* huge_ptep of dst_pte won't change as in child */
goto again;
}
hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -4902,7 +4937,7 @@ again:
raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
} else {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(src_vma);
}
return ret;
@@ -4961,6 +4996,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -4992,6 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
return len + old_addr - old_end;
}
@@ -5139,19 +5176,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
- * Clear this flag so that x86's huge_pmd_share page_table_shareable
- * test will fail on a vma being torn down, and not grab a page table
- * on its way out. We're lucky that the flag has such an appropriate
- * name, and can in fact be safely cleared here. We could clear it
- * before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_rwsem. This works
- * because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_rwsem is held.
+ * Unlock and free the vma lock before releasing i_mmap_rwsem. When
+ * the vma_lock is freed, this makes the vma ineligible for pmd
+ * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
+ * This is important as page tables for this unmapped range will
+ * be asynchrously deleted. If the page tables are shared, there
+ * will be issues when accessed by someone else.
*/
- vma->vm_flags &= ~VM_MAYSHARE;
+ __hugetlb_vma_unlock_write_free(vma);
+
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
@@ -5316,11 +5356,10 @@ retry_avoidcopy:
u32 hash;
put_page(old_page);
- BUG_ON(huge_pte_none(pte));
/*
- * Drop hugetlb_fault_mutex and i_mmap_rwsem before
- * unmapping. unmapping needs to hold i_mmap_rwsem
- * in write mode. Dropping i_mmap_rwsem in read mode
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
* here is OK as COW mappings do not interact with
* PMD sharing.
*
@@ -5328,13 +5367,13 @@ retry_avoidcopy:
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
unmap_ref_private(mm, vma, old_page, haddr);
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
@@ -5408,19 +5447,6 @@ out_release_old:
return ret;
}
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
-{
- struct address_space *mapping;
- pgoff_t idx;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
- return find_lock_page(mapping, idx);
-}
-
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5441,7 +5467,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
struct folio *folio = page_folio(page);
@@ -5478,7 +5504,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
unsigned long addr,
unsigned long reason)
{
- vm_fault_t ret;
u32 hash;
struct vm_fault vmf = {
.vma = vma,
@@ -5496,18 +5521,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * vma_lock and hugetlb_fault_mutex must be dropped before handling
+ * userfault. Also mmap_lock could be dropped due to handling
+ * userfault, any vma operation should be careful from here.
*/
+ hugetlb_vma_unlock_read(vma);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, reason);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
- return ret;
+ return handle_userfault(&vmf, reason);
}
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -5525,6 +5546,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
@@ -5535,29 +5557,24 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
- return ret;
+ goto out;
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * i_size is modified when holding i_mmap_rwsem, so check here
- * once for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
-retry:
new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
/* Check for page in userfault range */
- if (userfaultfd_missing(vma)) {
- ret = hugetlb_handle_userfault(vma, mapping, idx,
+ if (userfaultfd_missing(vma))
+ return hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, address,
VM_UFFD_MISSING);
- goto out;
- }
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
@@ -5585,11 +5602,17 @@ retry:
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = huge_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(page, mapping, idx);
if (err) {
+ /*
+ * err can't be -EEXIST which implies someone
+ * else consumed the reservation since hugetlb
+ * fault mutex is held when add a hugetlb page
+ * to the page cache. So it's safe to call
+ * restore_reserve_on_error() here.
+ */
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
- if (err == -EEXIST)
- goto retry;
goto out;
}
new_pagecache_page = true;
@@ -5617,10 +5640,9 @@ retry:
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
- ret = hugetlb_handle_userfault(vma, mapping, idx,
+ return hugetlb_handle_userfault(vma, mapping, idx,
flags, haddr, address,
VM_UFFD_MINOR);
- goto out;
}
}
@@ -5678,15 +5700,17 @@ retry:
unlock_page(page);
out:
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- /* restore reserve for newly allocated pages not in page cache */
if (new_page && !new_pagecache_page)
restore_reserve_on_error(h, vma, haddr, page);
+
+ unlock_page(page);
put_page(page);
goto out;
}
@@ -5747,40 +5771,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with i_size modifications during truncation.
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
- idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
entry = huge_ptep_get(ptep);
/* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ if (huge_pte_none_mostly(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+ * mutex internally, which make us return immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
- goto out_mutex;
- }
ret = 0;
@@ -5810,7 +5835,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+ pagecache_page = find_lock_page(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -5834,8 +5859,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
return handle_userfault(&vmf, VM_UFFD_WP);
}
@@ -5878,8 +5903,8 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -6007,39 +6032,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* Serialization between remove_inode_hugepages() and
- * huge_add_to_page_cache() below happens through the
+ * hugetlb_add_to_page_cache() below happens through the
* hugetlb_fault_mutex_table that here must be hold by
* the caller.
*/
- ret = huge_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
page_in_pagecache = true;
}
- ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
/*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
- goto out_release_unlock;
-
- ret = -EEXIST;
- /*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
+ ret = -EEXIST;
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
@@ -6107,7 +6117,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
- pages[nr] = mem_map_offset(page, nr);
+ pages[nr] = nth_page(page, nr);
if (vmas)
vmas[nr] = vma;
}
@@ -6271,7 +6281,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
- record_subpages_vmas(mem_map_offset(page, pfn_offset),
+ record_subpages_vmas(nth_page(page, pfn_offset),
vma, refs,
likely(pages) ? pages + i : NULL,
vmas ? vmas + i : NULL);
@@ -6342,8 +6352,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
- last_addr_mask = hugetlb_mask_last_page(h);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
@@ -6442,6 +6453,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
@@ -6467,6 +6479,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
}
/*
+ * vma specific semaphore used for pmd sharing synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
+ /*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
@@ -6489,12 +6506,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;
chg = to - from;
@@ -6589,6 +6605,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6658,35 +6675,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
+ *
+ * Also, vma_lock (vm_private_data) is required for sharing.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- !range_in_vma(svma, sbase, s_end))
+ !range_in_vma(svma, sbase, s_end) ||
+ !svma->vm_private_data)
return 0;
return saddr;
}
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
- return true;
- return false;
-}
-
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
+ unsigned long start = addr & PUD_MASK;
+ unsigned long end = start + PUD_SIZE;
+
#ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
#endif
- return vma_shareable(vma, addr);
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (!vma->vm_private_data) /* vma lock required for sharing */
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
}
/*
@@ -6716,16 +6735,157 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
*end = ALIGN(*end, PUD_SIZE);
}
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_write(&vma_lock->rw_sema);
+ }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (!__vma_shareable_flags_pmd(vma))
+ return 1;
+
+ return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ lockdep_assert_held(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+ struct hugetlb_vma_lock *vma_lock = container_of(kref,
+ struct hugetlb_vma_lock, refs);
+
+ kfree(vma_lock);
+}
+
+void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+ struct vm_area_struct *vma = vma_lock->vma;
+
+ /*
+ * vma_lock structure may or not be released as a result of put,
+ * it certainly will no longer be attached to vma so clear pointer.
+ * Semaphore synchronizes access to vma_lock->vma field.
+ */
+ vma_lock->vma = NULL;
+ vma->vm_private_data = NULL;
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas.
+ */
+ if (!vma || !__vma_shareable_flags_pmd(vma))
+ return;
+
+ if (vma->vm_private_data) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+ if (!vma_lock) {
+ /*
+ * If we can not allocate structure, then vma can not
+ * participate in pmd sharing. This is only a possible
+ * performance enhancement and memory saving issue.
+ * However, the lock is also used to synchronize page
+ * faults with truncation. If the lock is not present,
+ * unlikely races could leave pages in a file past i_size
+ * until the file is removed. Warn in the unlikely case of
+ * allocation failure.
+ */
+ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+ return;
+ }
+
+ kref_init(&vma_lock->refs);
+ init_rwsem(&vma_lock->rw_sema);
+ vma_lock->vma = vma;
+ vma->vm_private_data = vma_lock;
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible. For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space. This is important as we
- * are setting up sharing based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -6739,7 +6899,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte;
spinlock_t *ptl;
- i_mmap_assert_locked(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -6769,6 +6929,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_read(mapping);
return pte;
}
@@ -6779,7 +6940,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
@@ -6792,6 +6953,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -6803,6 +6965,48 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
@@ -7173,6 +7377,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
ptep = huge_pte_offset(mm, address, sz);
@@ -7184,6 +7389,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7334,7 +7540,7 @@ void __init hugetlb_cma_reserve(int order)
hugetlb_cma_size = 0;
}
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;