aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-05-14 10:10:55 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-05-14 10:10:55 -0700
commit318222a35bfb0ae9b5ff3e359a583463e6cfcd94 (patch)
tree6a8d921f7ac9915f3f12dd3fcd7efaaf6f16bb09 /Documentation
parentMerge tag 'ovl-update-5.2' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/vfs (diff)
parentkernel/memremap.c: remove the unused device_private_entry_fault() export (diff)
downloadlinux-dev-318222a35bfb0ae9b5ff3e359a583463e6cfcd94.tar.xz
linux-dev-318222a35bfb0ae9b5ff3e359a583463e6cfcd94.zip
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: - a few misc things and hotfixes - ocfs2 - almost all of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (139 commits) kernel/memremap.c: remove the unused device_private_entry_fault() export mm: delete find_get_entries_tag mm/huge_memory.c: make __thp_get_unmapped_area static mm/mprotect.c: fix compilation warning because of unused 'mm' variable mm/page-writeback: introduce tracepoint for wait_on_page_writeback() mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags mm/Kconfig: update "Memory Model" help text mm/vmscan.c: don't disable irq again when count pgrefill for memcg mm: memblock: make keeping memblock memory opt-in rather than opt-out hugetlbfs: always use address space in inode for resv_map pointer mm/z3fold.c: support page migration mm/z3fold.c: add structure for buddy handles mm/z3fold.c: improve compression by extending search mm/z3fold.c: introduce helper functions mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig mm/vmscan.c: simplify shrink_inactive_list() fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback xen/privcmd-buf.c: convert to use vm_map_pages_zero() xen/gntdev.c: convert to use vm_map_pages() ...
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/sysctl/vm.txt12
-rw-r--r--Documentation/trace/postprocess/trace-vmscan-postprocess.pl7
-rw-r--r--Documentation/vm/hmm.rst94
3 files changed, 91 insertions, 22 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3f13d8599337..749322060f10 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm:
- stat_refresh
- numa_stat
- swappiness
+- unprivileged_userfaultfd
- user_reserve_kbytes
- vfs_cache_pressure
- watermark_boost_factor
@@ -818,6 +819,17 @@ The default value is 60.
==============================================================
+unprivileged_userfaultfd
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls. Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (with SYS_CAP_PTRACE capability).
+
+The default value is 1.
+
+==============================================================
+
- user_reserve_kbytes
When overcommit_memory is set to 2, "never overcommit" mode, reserve
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index 66bfd8396877..995da15b16ca 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
-my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
+my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)';
@@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_inactive",
$regex_lru_shrink_inactive_default,
"nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback",
- "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep",
+ "nr_congested", "nr_immediate", "nr_activate_anon",
+ "nr_activate_file", "nr_ref_keep",
"nr_unmap_fail", "priority", "flags");
$regex_lru_shrink_active = generate_traceevent_regex(
"vmscan/mm_vmscan_lru_shrink_active",
@@ -407,7 +408,7 @@ EVENT_PROCESS:
}
my $nr_reclaimed = $3;
- my $flags = $12;
+ my $flags = $13;
my $file = 0;
if ($flags =~ /RECLAIM_WB_FILE/) {
$file = 1;
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 44205f0b671f..ec1efa32af3c 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -189,20 +189,10 @@ the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use either::
- int hmm_vma_get_pfns(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns);
- int hmm_vma_fault(struct vm_area_struct *vma,
- struct hmm_range *range,
- unsigned long start,
- unsigned long end,
- hmm_pfn_t *pfns,
- bool write,
- bool block);
-
-The first one (hmm_vma_get_pfns()) will only fetch present CPU page table
+ long hmm_range_snapshot(struct hmm_range *range);
+ long hmm_range_fault(struct hmm_range *range, bool block);
+
+The first one (hmm_range_snapshot()) will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
The second one does trigger a page fault on missing or read-only entry if the
write parameter is true. Page faults use the generic mm page fault code path
@@ -220,25 +210,56 @@ respect in order to keep things properly synchronized. The usage pattern is::
{
struct hmm_range range;
...
+
+ range.start = ...;
+ range.end = ...;
+ range.pfns = ...;
+ range.flags = ...;
+ range.values = ...;
+ range.pfn_shift = ...;
+ hmm_range_register(&range);
+
+ /*
+ * Just wait for range to be valid, safe to ignore return value as we
+ * will use the return value of hmm_range_snapshot() below under the
+ * mmap_sem to ascertain the validity of the range.
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+
again:
- ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
- if (ret)
+ down_read(&mm->mmap_sem);
+ ret = hmm_range_snapshot(&range);
+ if (ret) {
+ up_read(&mm->mmap_sem);
+ if (ret == -EAGAIN) {
+ /*
+ * No need to check hmm_range_wait_until_valid() return value
+ * on retry we will get proper error with hmm_range_snapshot()
+ */
+ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
+ goto again;
+ }
+ hmm_mirror_unregister(&range);
return ret;
+ }
take_lock(driver->update);
- if (!hmm_vma_range_done(vma, &range)) {
+ if (!range.valid) {
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
goto again;
}
// Use pfns array content to update device page table
+ hmm_mirror_unregister(&range);
release_lock(driver->update);
+ up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before hmm_vma_range_done() to avoid
-any race with a concurrent CPU page table update.
+update() callback. That lock must be held before checking the range.valid
+field to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
@@ -255,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this
concurrently).
+Leverage default_flags and pfn_flags_mask
+=========================================
+
+The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
+to set fault or snapshot policy for a whole range instead of having to set them
+for each entries in the range.
+
+For instance if the device flags for device entries are:
+ VALID (1 << 63)
+ WRITE (1 << 62)
+
+Now let say that device driver wants to fault with at least read a range then
+it does set:
+ range->default_flags = (1 << 63)
+ range->pfn_flags_mask = 0;
+
+and calls hmm_range_fault() as described above. This will fill fault all page
+in the range with at least read permission.
+
+Now let say driver wants to do the same except for one page in the range for
+which its want to have write. Now driver set:
+ range->default_flags = (1 << 63);
+ range->pfn_flags_mask = (1 << 62);
+ range->pfns[index_of_write] = (1 << 62);
+
+With this HMM will fault in all page with at least read (ie valid) and for the
+address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
+write permission ie if the CPU pte does not have write permission set then HMM
+will call handle_mm_fault().
+
+Note that HMM will populate the pfns array with write permission for any entry
+that have write permission within the CPU pte no matter what are the values set
+in default_flags or pfn_flags_mask.
+
+
Represent and manage device memory from core kernel point of view
=================================================================