Diffstat (limited to 'mm/gup.c')
-rw-r--r--  mm/gup.c | 316
1 file changed, 217 insertions, 99 deletions
diff --git a/mm/gup.c b/mm/gup.c
index 6076df8e04a4..e19ff770eb4c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -382,13 +382,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
}
/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
+ * but only after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
+}
+
+/*
+ * A (separate) COW fault might break the page the other way and
+ * get_user_pages() would return the page from what is now the wrong
+ * VM. So we need to force a COW break at GUP time even for reads.
+ */
+static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
+{
+ return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
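For illustration only (not part of the diff): a minimal sketch of how the two helpers above are intended to combine on the slow GUP path, written as if it lived in mm/gup.c so no extra includes are needed. The real call sites appear in the __get_user_pages() hunks below; example_follow_for_read() and its -EAGAIN convention are invented for this sketch.

static int example_follow_for_read(struct vm_area_struct *vma, pte_t pte,
				   unsigned int foll_flags)
{
	/* Break COW up front, even for a read-only lookup of a COW mapping. */
	if (should_force_cow_break(vma, foll_flags))
		foll_flags |= FOLL_WRITE;

	/*
	 * Once the forced write fault has gone through a COW cycle, the pte
	 * is dirty and FOLL_COW is set, so can_follow_write_pte() lets the
	 * lookup through even though the pte itself stays write-protected.
	 */
	if ((foll_flags & FOLL_WRITE) && !can_follow_write_pte(pte, foll_flags))
		return -EAGAIN;	/* caller would fault the page in and retry */

	return 0;
}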
@@ -980,6 +989,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
+ * -- 0 return value is possible when the fault would need to be retried.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
@@ -1066,9 +1076,11 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
goto out;
}
if (is_vm_hugetlb_page(vma)) {
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags, locked);
+ foll_flags, locked);
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
@@ -1082,13 +1094,17 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
}
+
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
+
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
- ret = -ERESTARTSYS;
+ ret = -EINTR;
goto out;
}
cond_resched();
@@ -1168,7 +1184,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
return true;
}
-/*
+/**
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1176,7 +1192,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
* @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
- * does not allow retry
+ * does not allow retry. If NULL, the caller must guarantee
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -1218,6 +1235,10 @@ retry:
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
+ if ((fault_flags & FAULT_FLAG_KILLABLE) &&
+ fatal_signal_pending(current))
+ return -EINTR;
+
ret = handle_mm_fault(vma, address, fault_flags);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
@@ -1230,11 +1251,9 @@ retry:
if (ret & VM_FAULT_RETRY) {
down_read(&mm->mmap_sem);
- if (!(fault_flags & FAULT_FLAG_TRIED)) {
- *unlocked = true;
- fault_flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ *unlocked = true;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
if (tsk) {
@@ -1247,6 +1266,10 @@ retry:
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
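A hedged caller-side sketch (not part of this diff) of the retry semantics documented above. The function name and the particular fault flags are assumptions; the locking pattern follows the updated kernel-doc, which requires @unlocked to be non-NULL whenever FAULT_FLAG_ALLOW_RETRY is passed.

static int example_fixup_one_page(struct mm_struct *mm, unsigned long addr)
{
	bool unlocked = false;
	int ret;

	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, addr,
			       FAULT_FLAG_WRITE | FAULT_FLAG_ALLOW_RETRY |
			       FAULT_FLAG_KILLABLE,
			       &unlocked);
	/*
	 * On VM_FAULT_RETRY the helper re-takes mmap_sem itself and loops,
	 * so the lock is held again by the time it returns; "unlocked" only
	 * records that it was dropped at some point in between.
	 */
	up_read(&mm->mmap_sem);

	/* 0 on success, -EFAULT/-ENOMEM on error, now -EINTR on SIGKILL. */
	return ret;
}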
+/*
+ * Please note that this function, unlike __get_user_pages(), will not
+ * return 0 for nr_pages > 0 unless FOLL_NOWAIT is used.
+ */
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
@@ -1837,7 +1860,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
-/*
+/**
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1868,13 +1891,13 @@ static long __get_user_pages_remote(struct task_struct *tsk,
*
* Must be called with mmap_sem held for read or write.
*
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
+ * get_user_pages_remote walks a process's page tables and takes a reference
+ * to each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
+ * get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
@@ -1886,17 +1909,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
+ * to get a handle on the memory by some means other than accesses
+ * via the user virtual addresses. The pages may be submitted for
+ * DMA to devices or accessed via their kernel linear mapping (via the
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
- * get_user_pages should be phased out in favor of
+ * get_user_pages_remote should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
+ * should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
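Hedged usage sketch (assumed caller code, not part of this diff) tying together the rules in the kernel-doc above: mmap_sem held across the call, set_page_dirty_lock() once the page has been written, and put_page() to drop the reference. The wrapper name is invented for illustration.

static long example_remote_pin_one(struct task_struct *tsk, struct mm_struct *mm,
				   unsigned long uaddr, struct page **page)
{
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote(tsk, mm, uaddr, 1, FOLL_WRITE,
				       page, NULL, NULL);
	up_read(&mm->mmap_sem);

	if (pinned == 1) {
		/* ... access the page, e.g. via kmap()/kunmap() ... */
		set_page_dirty_lock(*page);
		put_page(*page);
	}
	return pinned;	/* 1 on success, otherwise an error or short count */
}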
@@ -1935,7 +1958,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
}
#endif /* !CONFIG_MMU */
-/*
+/**
+ * get_user_pages() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's and don't allow
@@ -1958,11 +1991,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
+/**
* get_user_pages_locked() is suitable to replace the form:
*
* down_read(&mm->mmap_sem);
@@ -1978,6 +2007,21 @@ EXPORT_SYMBOL(get_user_pages);
* get_user_pages_locked(tsk, mm, ..., pages, &locked);
* if (locked)
* up_read(&mm->mmap_sem);
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
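A self-contained, hedged version of the replacement pattern shown in the kernel-doc above; the wrapper name and the FOLL_WRITE choice are assumptions made for the example.

static long example_gup_locked(unsigned long start, unsigned long nr_pages,
			       struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_locked(start, nr_pages, FOLL_WRITE, pages,
				    &locked);
	/*
	 * If a retry dropped mmap_sem, "locked" is now 0 and the lock must
	 * not be released a second time.
	 */
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}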
@@ -2664,62 +2708,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-/*
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
- * the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- *
- * If the architecture does not support this function, simply return with no
- * pages pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- unsigned long len, end;
- unsigned long flags;
- int nr_pinned = 0;
- /*
- * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
- * because gup fast is always a "pin with a +1 page refcount" request.
- */
- unsigned int gup_flags = FOLL_GET;
-
- if (write)
- gup_flags |= FOLL_WRITE;
-
- start = untagged_addr(start) & PAGE_MASK;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
- return 0;
- if (unlikely(!access_ok((void __user *)start, len)))
- return 0;
-
- /*
- * Disable interrupts. We use the nested form as we can already have
- * interrupts disabled by get_futex_key.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
-
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_save(flags);
- gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- }
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
-
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
@@ -2748,12 +2736,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
struct page **pages)
{
unsigned long addr, len, end;
+ unsigned long flags;
int nr_pinned = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
- FOLL_FORCE | FOLL_PIN | FOLL_GET)))
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+ FOLL_FAST_ONLY)))
return -EINVAL;
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ might_lock_read(&current->mm->mmap_sem);
+
start = untagged_addr(start) & PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2764,15 +2757,42 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_disable();
- gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned);
- local_irq_enable();
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ *
+ * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here,
+ * because there is no slow path to fall back on. But you'd
+ * better be careful about possible COW pages - you'll get _a_
+ * COW page, but not necessarily the one you intended to get
+ * depending on what COW event happens after this. COW may break
+ * the page copy in a random direction.
+ *
+ * Disable interrupts. The nested form is used, in order to allow
+ * full, general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being
+ * freed from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock(.) here as we also want to
+ * block IPIs that come from THPs splitting.
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
+ unsigned long fast_flags = gup_flags;
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ fast_flags |= FOLL_WRITE;
+
+ local_irq_save(flags);
+ gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
ret = nr_pinned;
}
- if (nr_pinned < nr_pages) {
+ if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
/* Try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
@@ -2792,6 +2812,51 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write
+ * access can get ambiguous page results. If you call this function without
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ int nr_pinned;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ *
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY;
+
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+
+ /*
+ * As specified in the API description above, this routine is not
+ * allowed to return negative values. However, the common core
+ * routine internal_get_user_pages_fast() *can* return -errno.
+ * Therefore, correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
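Hedged caller sketch (not part of this diff) for the relocated __get_user_pages_fast(): a read-only, IRQ-safe lookup that tolerates both a 0 return and the COW ambiguity warned about above. The wrapper name is invented; the reference taken is a plain FOLL_GET reference, so it is dropped with put_page().

static int example_peek_user_page(unsigned long uaddr, struct page **page)
{
	/* write == 0: on a private mapping we may get "a" COW page, not "the" one. */
	int nr = __get_user_pages_fast(uaddr, 1, 0, page);

	if (nr != 1)
		return -EFAULT;	/* nothing pinned; no slow-path fallback here */
	return 0;		/* caller eventually does put_page(*page) */
}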
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -2843,9 +2908,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for further details.
+ * see Documentation/core-api/pin_user_pages.rst for further details.
*
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
@@ -2860,6 +2925,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+/*
+ * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the
+ * same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+
+ /*
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+ * rules require returning 0, rather than -errno:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return 0;
+ /*
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * This routine is not allowed to return negative values. However,
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
+ * correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
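Hedged usage sketch (assumed caller, not part of this diff): trying to pin a single page without ever falling back to the slow path, e.g. from a context that must not sleep on mmap_sem. Pages pinned this way are released with unpin_user_page(), never put_page(). The helper names are invented for illustration.

static bool example_try_pin_fast(unsigned long uaddr, struct page **page)
{
	if (pin_user_pages_fast_only(uaddr, 1, FOLL_WRITE, page) != 1)
		return false;	/* 0 pinned; caller must cope or fall back later */
	return true;
}

static void example_unpin_fast(struct page *page)
{
	unpin_user_page(page);
}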
/**
* pin_user_pages_remote() - pin pages of a remote process (task != current)
*
@@ -2883,9 +2984,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
+ * see Documentation/core-api/pin_user_pages.rst for details.
*
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*/
long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
@@ -2919,9 +3020,9 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
+ * see Documentation/core-api/pin_user_pages.rst for details.
*
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It
* is NOT intended for Case 2 (RDMA: long-term pins).
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
@@ -2937,3 +3038,20 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages_unlocked);
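Finally, a hedged sketch (assumed caller, not part of this diff) of the new unlocked FOLL_PIN variant: mmap_sem must not already be held, since the helper takes and drops it internally, and the pinned pages are released with unpin_user_pages().

static long example_pin_range(unsigned long uaddr, unsigned long nr,
			      struct page **pages)
{
	long pinned = pin_user_pages_unlocked(uaddr, nr, pages, FOLL_WRITE);

	if (pinned > 0) {
		/* ... DMA to / read from the pages ... */
		unpin_user_pages(pages, pinned);
	}
	return pinned;	/* number of pages pinned, or -errno */
}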