aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/include/linux/mm.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--include/linux/mm.h313
1 files changed, 266 insertions, 47 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c54fb96cb1e6..5a323422d783 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -27,6 +27,7 @@
#include <linux/memremap.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
+#include <linux/sched.h>
struct mempolicy;
struct anon_vma;
@@ -342,6 +343,20 @@ extern unsigned int kobjsize(const void *objp);
/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ)
+#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)
+
+/* Common data flag combinations */
+#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \
+ VM_MAYWRITE | VM_MAYEXEC)
+#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
+
+#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */
+#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC
+#endif
+
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif
@@ -354,12 +369,18 @@ extern unsigned int kobjsize(const void *objp);
#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
+/* VMA basic access permission flags */
+#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)
+
+
/*
* Special vmas that are non-mergable, non-mlock()able.
- * Note: mm/huge_memory.c VM_NO_THP depends on this definition.
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/* This mask prevents VMA from being scanned with khugepaged */
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
+
/* This mask defines which mm->def_flags a process can inherit its parent */
#define VM_INIT_DEF_MASK VM_NOHUGEPAGE
@@ -378,15 +399,75 @@ extern unsigned int kobjsize(const void *objp);
*/
extern pgprot_t protection_map[16];
-#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
-#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED 0x20 /* Second try */
-#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */
-#define FAULT_FLAG_REMOTE 0x80 /* faulting for non current tsk/mm */
-#define FAULT_FLAG_INSTRUCTION 0x100 /* The fault was during an instruction fetch */
+/**
+ * Fault flag definitions.
+ *
+ * @FAULT_FLAG_WRITE: Fault was a write fault.
+ * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
+ * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
+ * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_sem and wait when retrying.
+ * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
+ * @FAULT_FLAG_TRIED: The fault has been tried once.
+ * @FAULT_FLAG_USER: The fault originated in userspace.
+ * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
+ * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
+ * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ *
+ * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
+ * whether we would allow page faults to retry by specifying these two
+ * fault flags correctly. Currently there can be three legal combinations:
+ *
+ * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
+ * this is the first try
+ *
+ * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
+ * we've already tried at least once
+ *
+ * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
+ *
+ * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
+ * be used. Note that page faults can be allowed to retry for multiple times,
+ * in which case we'll have an initial fault with flags (a) then later on
+ * continuous faults with flags (b). We should always try to detect pending
+ * signals before a retry to make sure the continuous page faults can still be
+ * interrupted if necessary.
+ */
+#define FAULT_FLAG_WRITE 0x01
+#define FAULT_FLAG_MKWRITE 0x02
+#define FAULT_FLAG_ALLOW_RETRY 0x04
+#define FAULT_FLAG_RETRY_NOWAIT 0x08
+#define FAULT_FLAG_KILLABLE 0x10
+#define FAULT_FLAG_TRIED 0x20
+#define FAULT_FLAG_USER 0x40
+#define FAULT_FLAG_REMOTE 0x80
+#define FAULT_FLAG_INSTRUCTION 0x100
+#define FAULT_FLAG_INTERRUPTIBLE 0x200
+
+/*
+ * The default fault flags that should be used by most of the
+ * arch-specific page fault handlers.
+ */
+#define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \
+ FAULT_FLAG_KILLABLE | \
+ FAULT_FLAG_INTERRUPTIBLE)
+
+/**
+ * fault_flag_allow_retry_first - check ALLOW_RETRY the first time
+ *
+ * This is mostly used for places where we want to try to avoid taking
+ * the mmap_sem for too long a time when waiting for another condition
+ * to change, in which case we can try to be polite to release the
+ * mmap_sem in the first round to avoid potential starvation of other
+ * processes that would also want the mmap_sem.
+ *
+ * Return: true if the page fault allows retry and this is the first
+ * attempt of the fault handling; false otherwise.
+ */
+static inline bool fault_flag_allow_retry_first(unsigned int flags)
+{
+ return (flags & FAULT_FLAG_ALLOW_RETRY) &&
+ (!(flags & FAULT_FLAG_TRIED));
+}
#define FAULT_FLAG_TRACE \
{ FAULT_FLAG_WRITE, "WRITE" }, \
@@ -397,7 +478,8 @@ extern pgprot_t protection_map[16];
{ FAULT_FLAG_TRIED, "TRIED" }, \
{ FAULT_FLAG_USER, "USER" }, \
{ FAULT_FLAG_REMOTE, "REMOTE" }, \
- { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }
+ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \
+ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }
/*
* vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -541,6 +623,36 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma)
return !vma->vm_ops;
}
+static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
+{
+ int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
+
+ if (!maybe_stack)
+ return false;
+
+ if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
+ VM_STACK_INCOMPLETE_SETUP)
+ return true;
+
+ return false;
+}
+
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
+
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_ACCESS_FLAGS;
+}
+
#ifdef CONFIG_SHMEM
/*
* The vma_is_shmem is not inline because it is used only by slow
@@ -770,6 +882,24 @@ static inline unsigned int compound_order(struct page *page)
return page[1].compound_order;
}
+static inline bool hpage_pincount_available(struct page *page)
+{
+ /*
+ * Can the page->hpage_pinned_refcount field be used? That field is in
+ * the 3rd page of the compound page, so the smallest (2-page) compound
+ * pages cannot support it.
+ */
+ page = compound_head(page);
+ return PageCompound(page) && compound_order(page) > 1;
+}
+
+static inline int compound_pincount(struct page *page)
+{
+ VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+ page = compound_head(page);
+ return atomic_read(compound_pincount_ptr(page));
+}
+
static inline void set_compound_order(struct page *page, unsigned int order)
{
page[1].compound_order = order;
@@ -1001,6 +1131,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
}
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
static inline __must_check bool try_get_page(struct page *page)
{
page = compound_head(page);
@@ -1029,29 +1161,87 @@ static inline void put_page(struct page *page)
__put_page(page);
}
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page: pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original page
+ * reference count, and also a new count of how many pin_user_pages() calls were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
+ *
+ * Choice of value:
+ *
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, due to the fact that adding an even power of two to the page
+ * refcount has the effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
*
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special handling.
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
*
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * Locking: the lockless algorithm described in page_cache_get_speculative()
+ * and page_cache_gup_pin_speculative() provides safe operation for
+ * get_user_pages and page_mkclean and other calls that race to set up page
+ * table entries.
*/
-static inline void unpin_user_page(struct page *page)
-{
- put_page(page);
-}
+#define GUP_PIN_COUNTING_BIAS (1U << 10)
+void unpin_user_page(struct page *page);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);
-
void unpin_user_pages(struct page **pages, unsigned long npages);
+/**
+ * page_maybe_dma_pinned() - report if a page is pinned for DMA.
+ *
+ * This function checks if a page has been pinned via a call to
+ * pin_user_pages*().
+ *
+ * For non-huge pages, the return value is partially fuzzy: false is not fuzzy,
+ * because it means "definitely not pinned for DMA", but true means "probably
+ * pinned for DMA, but possibly a false positive due to having at least
+ * GUP_PIN_COUNTING_BIAS worth of normal page references".
+ *
+ * False positives are OK, because: a) it's unlikely for a page to get that many
+ * refcounts, and b) all the callers of this routine are expected to be able to
+ * deal gracefully with a false positive.
+ *
+ * For huge pages, the result will be exactly correct. That's because we have
+ * more tracking data available: the 3rd struct page in the compound page is
+ * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS
+ * scheme).
+ *
+ * For more information, please see Documentation/vm/pin_user_pages.rst.
+ *
+ * @page: pointer to page to be queried.
+ * @Return: True, if it is likely that the page has been "dma-pinned".
+ * False, if the page is definitely not dma-pinned.
+ */
+static inline bool page_maybe_dma_pinned(struct page *page)
+{
+ if (hpage_pincount_available(page))
+ return compound_pincount(page) > 0;
+
+ /*
+ * page_ref_count() is signed. If that refcount overflows, then
+ * page_ref_count() returns a negative value, and callers will avoid
+ * further incrementing the refcount.
+ *
+ * Here, for that overflow case, use the signed bit to count a little
+ * bit higher via unsigned math, and thus still get an accurate result.
+ */
+ return ((unsigned int)page_ref_count(compound_head(page))) >=
+ GUP_PIN_COUNTING_BIAS;
+}
+
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1599,9 +1789,26 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks);
+
+/*
+ * Flags used by change_protection(). For now we make it a bitmap so
+ * that we can pass in multiple flags just like parameters. However
+ * for now all the callers are only use one of the flags at the same
+ * time.
+ */
+/* Whether we should allow dirty bit accounting */
+#define MM_CP_DIRTY_ACCT (1UL << 0)
+/* Whether this protection change is for NUMA hints */
+#define MM_CP_PROT_NUMA (1UL << 1)
+/* Whether this change is for write protecting */
+#define MM_CP_UFFD_WP (1UL << 2) /* do wp */
+#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
+#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
+ MM_CP_UFFD_WP_RESOLVE)
+
extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa);
+ unsigned long cp_flags);
extern int mprotect_fixup(struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -1720,6 +1927,18 @@ static inline void sync_mm_rss(struct mm_struct *mm)
}
#endif
+#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
+static inline int pte_special(pte_t pte)
+{
+ return 0;
+}
+
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+ return pte;
+}
+#endif
+
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
@@ -2364,26 +2583,7 @@ struct vm_unmapped_area_info {
unsigned long align_offset;
};
-extern unsigned long unmapped_area(struct vm_unmapped_area_info *info);
-extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
-
-/*
- * Search for an unmapped address range.
- *
- * We are looking for a range that:
- * - does not intersect with any VMA;
- * - is contained within the [low_limit, high_limit) interval;
- * - is at least the desired size.
- * - satisfies (begin_addr & align_mask) == (align_offset & align_mask)
- */
-static inline unsigned long
-vm_unmapped_area(struct vm_unmapped_area_info *info)
-{
- if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
- return unmapped_area_topdown(info);
- else
- return unmapped_area(info);
-}
+extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
@@ -2519,6 +2719,8 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
+int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
+ struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
@@ -2867,6 +3069,23 @@ extern long copy_huge_page_from_user(struct page *dst_page,
const void __user *usr_src,
unsigned int pages_per_huge_page,
bool allow_pagefault);
+
+/**
+ * vma_is_special_huge - Are transhuge page-table entries considered special?
+ * @vma: Pointer to the struct vm_area_struct to consider
+ *
+ * Whether transhuge page-table entries are considered "special" following
+ * the definition in vm_normal_page().
+ *
+ * Return: true if transhuge page-table entries should be considered special,
+ * false otherwise.
+ */
+static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
+{
+ return vma_is_dax(vma) || (vma->vm_file &&
+ (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
#ifdef CONFIG_DEBUG_PAGEALLOC