aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/Makefile4
-rw-r--r--arch/s390/mm/cmm.c74
-rw-r--r--arch/s390/mm/dump_pagetables.c405
-rw-r--r--arch/s390/mm/extable.c81
-rw-r--r--arch/s390/mm/extmem.c34
-rw-r--r--arch/s390/mm/fault.c329
-rw-r--r--arch/s390/mm/gmap.c424
-rw-r--r--arch/s390/mm/hugetlbpage.c93
-rw-r--r--arch/s390/mm/init.c53
-rw-r--r--arch/s390/mm/kasan_init.c157
-rw-r--r--arch/s390/mm/maccess.c234
-rw-r--r--arch/s390/mm/mmap.c67
-rw-r--r--arch/s390/mm/page-states.c70
-rw-r--r--arch/s390/mm/pageattr.c85
-rw-r--r--arch/s390/mm/pgalloc.c383
-rw-r--r--arch/s390/mm/pgtable.c183
-rw-r--r--arch/s390/mm/vmem.c835
17 files changed, 2071 insertions, 1440 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index 3175413186b9..57e4f3a24829 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -4,11 +4,11 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
-obj-y += page-states.o pageattr.o pgtable.o pgalloc.o
+obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
KASAN_SANITIZE_kasan_init.o := n
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index a51c892f14f3..9141ed4c52e9 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -14,15 +14,13 @@
#include <linux/moduleparam.h>
#include <linux/gfp.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysctl.h>
-#include <linux/ctype.h>
#include <linux/swap.h>
#include <linux/kthread.h>
#include <linux/oom.h>
-#include <linux/suspend.h>
#include <linux/uaccess.h>
-#include <asm/pgalloc.h>
#include <asm/diag.h>
#ifdef CONFIG_CMM_IUCV
@@ -49,7 +47,6 @@ static volatile long cmm_pages_target;
static volatile long cmm_timed_pages_target;
static long cmm_timeout_pages;
static long cmm_timeout_seconds;
-static int cmm_suspended;
static struct cmm_page_array *cmm_page_list;
static struct cmm_page_array *cmm_timed_page_list;
@@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(addr >> PAGE_SHIFT, 1);
+ diag10_range(virt_to_pfn(addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
@@ -151,9 +148,9 @@ static int cmm_thread(void *dummy)
while (1) {
rc = wait_event_interruptible(cmm_thread_wait,
- (!cmm_suspended && (cmm_pages != cmm_pages_target ||
- cmm_timed_pages != cmm_timed_pages_target)) ||
- kthread_should_stop());
+ cmm_pages != cmm_pages_target ||
+ cmm_timed_pages != cmm_timed_pages_target ||
+ kthread_should_stop());
if (kthread_should_stop() || rc == -ERESTARTSYS) {
cmm_pages_target = cmm_pages;
cmm_timed_pages_target = cmm_timed_pages;
@@ -191,7 +188,7 @@ static void cmm_set_timer(void)
del_timer(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+ mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp)
}
static int cmm_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long nr = cmm_get_pages();
struct ctl_table ctl_entry = {
@@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
long nr = cmm_get_timed_pages();
@@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write,
}
static int cmm_timeout_handler(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char buf[64], *p;
long nr, seconds;
@@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
if (write) {
len = min(*lenp, sizeof(buf));
- if (copy_from_user(buf, buffer, len))
- return -EFAULT;
+ memcpy(buf, buffer, len);
buf[len - 1] = '\0';
cmm_skip_blanks(buf, &p);
nr = simple_strtoul(p, &p, 0);
@@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write,
cmm_timeout_pages, cmm_timeout_seconds);
if (len > *lenp)
len = *lenp;
- if (copy_to_user(buffer, buf, len))
- return -EFAULT;
+ memcpy(buffer, buf, len);
*lenp = len;
*ppos += len;
}
@@ -390,38 +385,6 @@ static void cmm_smsg_target(const char *from, char *msg)
static struct ctl_table_header *cmm_sysctl_header;
-static int cmm_suspend(void)
-{
- cmm_suspended = 1;
- cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
- cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
- return 0;
-}
-
-static int cmm_resume(void)
-{
- cmm_suspended = 0;
- cmm_kick_thread();
- return 0;
-}
-
-static int cmm_power_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- switch (event) {
- case PM_POST_HIBERNATION:
- return cmm_resume();
- case PM_HIBERNATION_PREPARE:
- return cmm_suspend();
- default:
- return NOTIFY_DONE;
- }
-}
-
-static struct notifier_block cmm_power_notifier = {
- .notifier_call = cmm_power_event,
-};
-
static int __init cmm_init(void)
{
int rc = -ENOMEM;
@@ -431,13 +394,10 @@ static int __init cmm_init(void)
goto out_sysctl;
#ifdef CONFIG_CMM_IUCV
/* convert sender to uppercase characters */
- if (sender) {
- int len = strlen(sender);
- while (len--)
- sender[len] = toupper(sender[len]);
- } else {
+ if (sender)
+ string_upper(sender, sender);
+ else
sender = cmm_default_sender;
- }
rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target);
if (rc < 0)
@@ -446,16 +406,11 @@ static int __init cmm_init(void)
rc = register_oom_notifier(&cmm_oom_nb);
if (rc < 0)
goto out_oom_notify;
- rc = register_pm_notifier(&cmm_power_notifier);
- if (rc)
- goto out_pm;
cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread");
if (!IS_ERR(cmm_thread_ptr))
return 0;
rc = PTR_ERR(cmm_thread_ptr);
- unregister_pm_notifier(&cmm_power_notifier);
-out_pm:
unregister_oom_notifier(&cmm_oom_nb);
out_oom_notify:
#ifdef CONFIG_CMM_IUCV
@@ -475,7 +430,6 @@ static void __exit cmm_exit(void)
#ifdef CONFIG_CMM_IUCV
smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target);
#endif
- unregister_pm_notifier(&cmm_power_notifier);
unregister_oom_notifier(&cmm_oom_nb);
kthread_stop(cmm_thread_ptr);
del_timer_sync(&cmm_timer);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 5d67b81c704a..9953819d7959 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -1,12 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/set_memory.h>
+#include <linux/ptdump.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
-#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/kfence.h>
#include <linux/kasan.h>
+#include <asm/ptdump.h>
#include <asm/kasan.h>
+#include <asm/abs_lowcore.h>
+#include <asm/nospec-branch.h>
#include <asm/sections.h>
-#include <asm/pgtable.h>
+#include <asm/maccess.h>
static unsigned long max_addr;
@@ -16,266 +21,267 @@ struct addr_marker {
};
enum address_markers_idx {
- IDENTITY_NR = 0,
+ IDENTITY_BEFORE_NR = 0,
+ IDENTITY_BEFORE_END_NR,
+ AMODE31_START_NR,
+ AMODE31_END_NR,
KERNEL_START_NR,
KERNEL_END_NR,
+#ifdef CONFIG_KFENCE
+ KFENCE_START_NR,
+ KFENCE_END_NR,
+#endif
+ IDENTITY_AFTER_NR,
+ IDENTITY_AFTER_END_NR,
#ifdef CONFIG_KASAN
KASAN_SHADOW_START_NR,
KASAN_SHADOW_END_NR,
#endif
VMEMMAP_NR,
+ VMEMMAP_END_NR,
VMALLOC_NR,
+ VMALLOC_END_NR,
MODULES_NR,
+ MODULES_END_NR,
+ ABS_LOWCORE_NR,
+ ABS_LOWCORE_END_NR,
+ MEMCPY_REAL_NR,
+ MEMCPY_REAL_END_NR,
};
static struct addr_marker address_markers[] = {
- [IDENTITY_NR] = {0, "Identity Mapping"},
+ [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
+ [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
+ [AMODE31_START_NR] = {0, "Amode31 Area Start"},
+ [AMODE31_END_NR] = {0, "Amode31 Area End"},
[KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
[KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
+#ifdef CONFIG_KFENCE
+ [KFENCE_START_NR] = {0, "KFence Pool Start"},
+ [KFENCE_END_NR] = {0, "KFence Pool End"},
+#endif
+ [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"},
+ [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"},
#ifdef CONFIG_KASAN
[KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"},
[KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"},
#endif
- [VMEMMAP_NR] = {0, "vmemmap Area"},
- [VMALLOC_NR] = {0, "vmalloc Area"},
- [MODULES_NR] = {0, "Modules Area"},
+ [VMEMMAP_NR] = {0, "vmemmap Area Start"},
+ [VMEMMAP_END_NR] = {0, "vmemmap Area End"},
+ [VMALLOC_NR] = {0, "vmalloc Area Start"},
+ [VMALLOC_END_NR] = {0, "vmalloc Area End"},
+ [MODULES_NR] = {0, "Modules Area Start"},
+ [MODULES_END_NR] = {0, "Modules Area End"},
+ [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"},
+ [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"},
+ [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"},
+ [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"},
{ -1, NULL }
};
struct pg_state {
+ struct ptdump_state ptdump;
+ struct seq_file *seq;
int level;
unsigned int current_prot;
+ bool check_wx;
+ unsigned long wx_pages;
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
};
+#define pt_dump_seq_printf(m, fmt, args...) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt, ##args); \
+})
+
+#define pt_dump_seq_puts(m, fmt) \
+({ \
+ struct seq_file *__m = (m); \
+ \
+ if (__m) \
+ seq_printf(__m, fmt); \
+})
+
static void print_prot(struct seq_file *m, unsigned int pr, int level)
{
static const char * const level_name[] =
{ "ASCE", "PGD", "PUD", "PMD", "PTE" };
- seq_printf(m, "%s ", level_name[level]);
+ pt_dump_seq_printf(m, "%s ", level_name[level]);
if (pr & _PAGE_INVALID) {
- seq_printf(m, "I\n");
+ pt_dump_seq_printf(m, "I\n");
return;
}
- seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
- seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
+ pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW ");
+ pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n");
}
-static void note_page(struct seq_file *m, struct pg_state *st,
- unsigned int new_prot, int level)
+static void note_prot_wx(struct pg_state *st, unsigned long addr)
+{
+#ifdef CONFIG_DEBUG_WX
+ if (!st->check_wx)
+ return;
+ if (st->current_prot & _PAGE_INVALID)
+ return;
+ if (st->current_prot & _PAGE_PROTECT)
+ return;
+ if (st->current_prot & _PAGE_NOEXEC)
+ return;
+ /*
+ * The first lowcore page is W+X if spectre mitigations are using
+ * trampolines or the BEAR enhancements facility is not installed,
+ * in which case we have two lpswe instructions in lowcore that need
+ * to be executable.
+ */
+ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
+ return;
+ WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n",
+ (void *)st->start_address);
+ st->wx_pages += (addr - st->start_address) / PAGE_SIZE;
+#endif /* CONFIG_DEBUG_WX */
+}
+
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val)
{
- static const char units[] = "KMGTPE";
int width = sizeof(unsigned long) * 2;
+ static const char units[] = "KMGTPE";
const char *unit = units;
- unsigned int prot, cur;
unsigned long delta;
+ struct pg_state *st;
+ struct seq_file *m;
+ unsigned int prot;
- /*
- * If we have a "break" in the series, we need to flush the state
- * that we have now. "break" is either changing perms, levels or
- * address space marker.
- */
- prot = new_prot;
- cur = st->current_prot;
-
- if (!st->level) {
- /* First entry */
- st->current_prot = new_prot;
+ st = container_of(pt_st, struct pg_state, ptdump);
+ m = st->seq;
+ prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC);
+ if (level == 4 && (val & _PAGE_INVALID))
+ prot = _PAGE_INVALID;
+ /* For pmd_none() & friends val gets passed as zero. */
+ if (level != 4 && !val)
+ prot = _PAGE_INVALID;
+ /* Final flush from generic code. */
+ if (level == -1)
+ addr = max_addr;
+ if (st->level == -1) {
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
- st->marker = address_markers;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
- } else if (prot != cur || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
- /* Print the actual finished series */
- seq_printf(m, "0x%0*lx-0x%0*lx ",
- width, st->start_address,
- width, st->current_address);
- delta = (st->current_address - st->start_address) >> 10;
+ } else if (prot != st->current_prot || level != st->level ||
+ addr >= st->marker[1].start_address) {
+ note_prot_wx(st, addr);
+ pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ",
+ width, st->start_address,
+ width, addr);
+ delta = (addr - st->start_address) >> 10;
while (!(delta & 0x3ff) && unit[1]) {
delta >>= 10;
unit++;
}
- seq_printf(m, "%9lu%c ", delta, *unit);
+ pt_dump_seq_printf(m, "%9lu%c ", delta, *unit);
print_prot(m, st->current_prot, st->level);
- while (st->current_address >= st->marker[1].start_address) {
+ while (addr >= st->marker[1].start_address) {
st->marker++;
- seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
- st->start_address = st->current_address;
- st->current_prot = new_prot;
+ st->start_address = addr;
+ st->current_prot = prot;
st->level = level;
}
}
-#ifdef CONFIG_KASAN
-static void note_kasan_early_shadow_page(struct seq_file *m,
- struct pg_state *st)
-{
- unsigned int prot;
-
- prot = pte_val(*kasan_early_shadow_pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
-}
-#endif
-
-/*
- * The actual page table walker functions. In order to keep the
- * implementation of print_prot() short, we only check and pass
- * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region,
- * segment or page table entry is invalid or read-only.
- * After all it's just a hint that the current level being walked
- * contains an invalid or read-only entry.
- */
-static void walk_pte_level(struct seq_file *m, struct pg_state *st,
- pmd_t *pmd, unsigned long addr)
-{
- unsigned int prot;
- pte_t *pte;
- int i;
-
- for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) {
- st->current_address = addr;
- pte = pte_offset_kernel(pmd, addr);
- prot = pte_val(*pte) &
- (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC);
- note_page(m, st, prot, 4);
- addr += PAGE_SIZE;
- }
-}
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st,
- pud_t *pud, unsigned long addr)
-{
- unsigned int prot;
- pmd_t *pmd;
- int i;
-
-#ifdef CONFIG_KASAN
- if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- pmd = pmd_offset(pud, addr);
- for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) {
- st->current_address = addr;
- if (!pmd_none(*pmd)) {
- if (pmd_large(*pmd)) {
- prot = pmd_val(*pmd) &
- (_SEGMENT_ENTRY_PROTECT |
- _SEGMENT_ENTRY_NOEXEC);
- note_page(m, st, prot, 3);
- } else
- walk_pte_level(m, st, pmd, addr);
- } else
- note_page(m, st, _PAGE_INVALID, 3);
- addr += PMD_SIZE;
- }
-}
-
-static void walk_pud_level(struct seq_file *m, struct pg_state *st,
- p4d_t *p4d, unsigned long addr)
+#ifdef CONFIG_DEBUG_WX
+void ptdump_check_wx(void)
{
- unsigned int prot;
- pud_t *pud;
- int i;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = NULL,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = true,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = (struct addr_marker[]) {
+ { .start_address = 0, .name = NULL},
+ { .start_address = -1, .name = NULL},
+ },
+ };
-#ifdef CONFIG_KASAN
- if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) {
- note_kasan_early_shadow_page(m, st);
+ if (!MACHINE_HAS_NX)
return;
- }
-#endif
-
- pud = pud_offset(p4d, addr);
- for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) {
- st->current_address = addr;
- if (!pud_none(*pud))
- if (pud_large(*pud)) {
- prot = pud_val(*pud) &
- (_REGION_ENTRY_PROTECT |
- _REGION_ENTRY_NOEXEC);
- note_page(m, st, prot, 2);
- } else
- walk_pmd_level(m, st, pud, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += PUD_SIZE;
- }
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ if (st.wx_pages)
+ pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages);
+ else
+ pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
+ (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
+ "unexpected " : "");
}
+#endif /* CONFIG_DEBUG_WX */
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st,
- pgd_t *pgd, unsigned long addr)
+#ifdef CONFIG_PTDUMP_DEBUGFS
+static int ptdump_show(struct seq_file *m, void *v)
{
- p4d_t *p4d;
- int i;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = (struct ptdump_range[]) {
+ {.start = 0, .end = max_addr},
+ {.start = 0, .end = 0},
+ }
+ },
+ .seq = m,
+ .level = -1,
+ .current_prot = 0,
+ .check_wx = false,
+ .wx_pages = 0,
+ .start_address = 0,
+ .marker = address_markers,
+ };
-#ifdef CONFIG_KASAN
- if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) {
- note_kasan_early_shadow_page(m, st);
- return;
- }
-#endif
-
- p4d = p4d_offset(pgd, addr);
- for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) {
- st->current_address = addr;
- if (!p4d_none(*p4d))
- walk_pud_level(m, st, p4d, addr);
- else
- note_page(m, st, _PAGE_INVALID, 2);
- addr += P4D_SIZE;
- }
+ get_online_mems();
+ mutex_lock(&cpa_mutex);
+ ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
+ mutex_unlock(&cpa_mutex);
+ put_online_mems();
+ return 0;
}
+DEFINE_SHOW_ATTRIBUTE(ptdump);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
-static void walk_pgd_level(struct seq_file *m)
+/*
+ * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple
+ * insertion sort to preserve the original order of markers with the same
+ * start address.
+ */
+static void sort_address_markers(void)
{
- unsigned long addr = 0;
- struct pg_state st;
- pgd_t *pgd;
- int i;
+ struct addr_marker tmp;
+ int i, j;
- memset(&st, 0, sizeof(st));
- for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) {
- st.current_address = addr;
- pgd = pgd_offset_k(addr);
- if (!pgd_none(*pgd))
- walk_p4d_level(m, &st, pgd, addr);
- else
- note_page(m, &st, _PAGE_INVALID, 1);
- addr += PGDIR_SIZE;
- cond_resched();
+ for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) {
+ tmp = address_markers[i];
+ for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--)
+ address_markers[j + 1] = address_markers[j];
+ address_markers[j + 1] = tmp;
}
- /* Flush out the last page */
- st.current_address = max_addr;
- note_page(m, &st, 0, 0);
-}
-
-static int ptdump_show(struct seq_file *m, void *v)
-{
- walk_pgd_level(m);
- return 0;
-}
-
-static int ptdump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show, NULL);
}
-static const struct file_operations ptdump_fops = {
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int pt_dump_init(void)
{
+#ifdef CONFIG_KFENCE
+ unsigned long kfence_start = (unsigned long)__kfence_pool;
+#endif
/*
* Figure out the maximum virtual address being accessible with the
* kernel ASCE. We need this to keep the page table walker functions
@@ -283,10 +289,27 @@ static int pt_dump_init(void)
*/
max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
+ address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size;
+ address_markers[AMODE31_START_NR].start_address = __samode31;
+ address_markers[AMODE31_END_NR].start_address = __eamode31;
address_markers[MODULES_NR].start_address = MODULES_VADDR;
+ address_markers[MODULES_END_NR].start_address = MODULES_END;
+ address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
+ address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
+ address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
+ address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
+ address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
+#ifdef CONFIG_KFENCE
+ address_markers[KFENCE_START_NR].start_address = kfence_start;
+ address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE;
+#endif
+ sort_address_markers();
+#ifdef CONFIG_PTDUMP_DEBUGFS
debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops);
+#endif /* CONFIG_PTDUMP_DEBUGFS */
return 0;
}
device_initcall(pt_dump_init);
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
new file mode 100644
index 000000000000..1e4d2187541a
--- /dev/null
+++ b/arch/s390/mm/extable.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitfield.h>
+#include <linux/extable.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/panic.h>
+#include <asm/asm-extable.h>
+#include <asm/extable.h>
+
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+ const struct exception_table_entry *fixup;
+ size_t num;
+
+ fixup = search_exception_tables(addr);
+ if (fixup)
+ return fixup;
+ num = __stop_amode31_ex_table - __start_amode31_ex_table;
+ return search_extable(__start_amode31_ex_table, num, addr);
+}
+
+static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+ size_t len = FIELD_GET(EX_DATA_LEN, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ memset((void *)regs->gprs[reg_addr], 0, len);
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->gprs[reg_zero] = 0;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+bool fixup_exception(struct pt_regs *regs)
+{
+ const struct exception_table_entry *ex;
+
+ ex = s390_search_extables(instruction_pointer(regs));
+ if (!ex)
+ return false;
+ switch (ex->type) {
+ case EX_TYPE_FIXUP:
+ return ex_handler_fixup(ex, regs);
+ case EX_TYPE_BPF:
+ return ex_handler_bpf(ex, regs);
+ case EX_TYPE_UA_STORE:
+ return ex_handler_ua_store(ex, regs);
+ case EX_TYPE_UA_LOAD_MEM:
+ return ex_handler_ua_load_mem(ex, regs);
+ case EX_TYPE_UA_LOAD_REG:
+ return ex_handler_ua_load_reg(ex, regs);
+ }
+ panic("invalid exception table entry");
+}
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index fd0dae9d10f4..5060956b8e7d 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -20,9 +20,9 @@
#include <linux/ctype.h>
#include <linux/ioport.h>
#include <linux/refcount.h>
+#include <linux/pgtable.h>
#include <asm/diag.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/ebcdic.h>
#include <asm/errno.h>
#include <asm/extmem.h>
@@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
goto out_free;
}
- rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
-
- if (rc)
- goto out_free;
-
seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
if (seg->res == NULL) {
rc = -ENOMEM;
- goto out_shared;
+ goto out_free;
}
seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
seg->res->start = seg->start_addr;
@@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
if (rc == SEG_TYPE_SC ||
((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
seg->res->flags |= IORESOURCE_READONLY;
+
+ /* Check for overlapping resources before adding the mapping. */
if (request_resource(&iomem_resource, seg->res)) {
rc = -EBUSY;
- kfree(seg->res);
- goto out_shared;
+ goto out_free_resource;
}
+ rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+ if (rc)
+ goto out_resource;
+
if (do_nonshared)
diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
&start_addr, &end_addr);
@@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
rc = diag_cc;
- goto out_resource;
+ goto out_mapping;
}
if (diag_cc > 1) {
pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
rc = dcss_diag_translate_rc(end_addr);
dcss_diag(&purgeseg_scode, seg->dcss_name,
&dummy, &dummy);
- goto out_resource;
+ goto out_mapping;
}
seg->start_addr = start_addr;
seg->end = end_addr;
@@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
(void*) seg->end, segtype_string[seg->vm_segtype]);
}
goto out;
+ out_mapping:
+ vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_resource:
release_resource(seg->res);
+ out_free_resource:
kfree(seg->res);
- out_shared:
- vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
out_free:
kfree(seg);
out:
@@ -400,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
* -EIO : could not perform query or load diagnose
* -ENOENT : no such segment
* -EOPNOTSUPP: multi-part segment cannot be used with linux
- * -ENOSPC : segment cannot be used (overlaps with storage)
- * -EBUSY : segment can temporarily not be used (overlaps with dcss)
+ * -EBUSY : segment cannot be used (overlaps with dcss or storage)
* -ERANGE : segment cannot be used (exceeds kernel mapping range)
* -EPERM : segment is currently loaded with incompatible permissions
* -ENOMEM : out of memory
@@ -626,10 +626,6 @@ void segment_warning(int rc, char *seg_name)
pr_err("DCSS %s has multiple page ranges and cannot be "
"loaded or queried\n", seg_name);
break;
- case -ENOSPC:
- pr_err("DCSS %s overlaps with used storage and cannot "
- "be loaded\n", seg_name);
- break;
case -EBUSY:
pr_err("%s needs used memory resources and cannot be "
"loaded or queried\n", seg_name);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb475c166..9649d9382e0a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,29 +31,30 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
+#include <linux/kfence.h>
+#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
+#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000)
enum fault_type {
KERNEL_FAULT,
USER_FAULT,
- VDSO_FAULT,
GMAP_FAULT,
};
@@ -77,22 +78,16 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
trans_exc_code = regs->int_parm_long & 3;
if (likely(trans_exc_code == 0)) {
/* primary space exception */
- if (IS_ENABLED(CONFIG_PGSTE) &&
- test_pt_regs_flag(regs, PIF_GUEST_FAULT))
- return GMAP_FAULT;
- if (current->thread.mm_segment == USER_DS)
+ if (user_mode(regs))
return USER_FAULT;
- return KERNEL_FAULT;
- }
- if (trans_exc_code == 2) {
- /* secondary space exception */
- if (current->thread.mm_segment & 1) {
- if (current->thread.mm_segment == USER_DS_SACF)
- return USER_FAULT;
+ if (!IS_ENABLED(CONFIG_PGSTE))
return KERNEL_FAULT;
- }
- return VDSO_FAULT;
+ if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+ return GMAP_FAULT;
+ return KERNEL_FAULT;
}
+ if (trans_exc_code == 2)
+ return USER_FAULT;
if (trans_exc_code == 1) {
/* access register mode, not used in the kernel */
return USER_FAULT;
@@ -105,7 +100,7 @@ static int bad_address(void *p)
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -121,8 +116,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R1:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
if (bad_address(table))
@@ -130,8 +125,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R2:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
if (bad_address(table))
@@ -139,8 +134,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R3:%016lx ", *table);
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (bad_address(table))
@@ -148,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("S:%016lx ", *table);
if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
if (bad_address(table))
@@ -188,10 +183,6 @@ static void dump_fault_info(struct pt_regs *regs)
asce = S390_lowcore.user_asce;
pr_cont("user ");
break;
- case VDSO_FAULT:
- asce = S390_lowcore.vdso_asce;
- pr_cont("vdso ");
- break;
case GMAP_FAULT:
asce = ((struct gmap *) S390_lowcore.gmap)->asce;
pr_cont("gmap ");
@@ -237,29 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-const struct exception_table_entry *s390_search_extables(unsigned long addr)
-{
- const struct exception_table_entry *fixup;
-
- fixup = search_extable(__start_dma_ex_table,
- __stop_dma_ex_table - __start_dma_ex_table,
- addr);
- if (!fixup)
- fixup = search_exception_tables(addr);
- return fixup;
-}
-
static noinline void do_no_context(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
-
- /* Are we prepared to handle this kernel fault? */
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ if (fixup_exception(regs))
return;
- }
-
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
@@ -272,7 +244,6 @@ static noinline void do_no_context(struct pt_regs *regs)
" in virtual user address space\n");
dump_fault_info(regs);
die(regs, "Oops");
- do_exit(SIGKILL);
}
static noinline void do_low_address(struct pt_regs *regs)
@@ -282,7 +253,6 @@ static noinline void do_low_address(struct pt_regs *regs)
if (regs->psw.mask & PSW_MASK_PSTATE) {
/* Low-address protection hit in user mode 'cannot happen'. */
die (regs, "Low-address protection");
- do_exit(SIGKILL);
}
do_no_context(regs);
@@ -298,36 +268,12 @@ static noinline void do_sigbus(struct pt_regs *regs)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-static noinline int signal_return(struct pt_regs *regs)
-{
- u16 instruction;
- int rc;
-
- rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
- if (rc)
- return rc;
- if (instruction == 0x0a77) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x00040077;
- return 0;
- } else if (instruction == 0x0aad) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x000400ad;
- return 0;
- }
- return -EACCES;
-}
-
-static noinline void do_fault_error(struct pt_regs *regs, int access,
- vm_fault_t fault)
+static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
int si_code;
switch (fault) {
case VM_FAULT_BADACCESS:
- if (access == VM_EXEC && signal_return(regs) == 0)
- break;
- /* fallthrough */
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
@@ -337,9 +283,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
do_sigsegv(regs, si_code);
break;
}
- /* fallthrough */
+ fallthrough;
case VM_FAULT_BADCONTEXT:
- /* fallthrough */
case VM_FAULT_PFAULT:
do_no_context(regs);
break;
@@ -377,7 +322,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
+ * 04 Protection -> Write-Protection (suppression)
* 10 Segment translation -> Not present (nullification)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
@@ -393,19 +338,22 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
unsigned long address;
unsigned int flags;
vm_fault_t fault;
+ bool is_write;
tsk = current;
/*
* The instruction that caused the program check has
* been nullified. Don't signal single step via SIGTRAP.
*/
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
if (kprobe_page_fault(regs, 14))
return 0;
mm = tsk->mm;
trans_exc_code = regs->int_parm_long;
+ address = trans_exc_code & __FAIL_ADDR_MASK;
+ is_write = (trans_exc_code & store_indication) == 0x400;
/*
* Verify that the fault happened in user space, that
@@ -416,9 +364,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
type = get_fault_type(regs);
switch (type) {
case KERNEL_FAULT:
- goto out;
- case VDSO_FAULT:
- fault = VM_FAULT_BADMAP;
+ if (kfence_handle_page_fault(address, is_write, regs))
+ return 0;
goto out;
case USER_FAULT:
case GMAP_FAULT:
@@ -427,14 +374,15 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
break;
}
- address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -472,57 +420,49 @@ retry:
if (unlikely(!(vma->vm_flags & access)))
goto out_up;
- if (is_vm_hugetlb_page(vma))
- address &= HPAGE_MASK;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto out_gmap;
+ }
+ fault = 0;
+ goto out;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
- flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
- goto retry;
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+ * not been released
+ */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
@@ -537,7 +477,7 @@ retry:
}
fault = 0;
out_up:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return fault;
}
@@ -575,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs)
fault = do_exception(regs, access);
}
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);
@@ -584,10 +524,10 @@ void do_dat_exception(struct pt_regs *regs)
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);
@@ -737,7 +677,7 @@ static void pfault_interrupt(struct ext_code ext_code,
* interrupt since it must be a leftover of a PFAULT
* CANCEL operation which didn't remove all pending
* completion interrupts. */
- if (tsk->state == TASK_RUNNING)
+ if (task_is_running(tsk))
tsk->thread.pfault_wait = -1;
}
} else {
@@ -816,3 +756,130 @@ out_extint:
early_initcall(pfault_irq_init);
#endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ struct gmap *gmap;
+ int rc;
+
+ /*
+ * bit 61 tells us if the address is valid, if it's not we
+ * have a major problem and should stop the kernel or send a
+ * SIGSEGV to the process. Unfortunately bit 61 is not
+ * reliable without the misc UV feature so we need to check
+ * for that as well.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+ !test_bit_inv(61, &regs->int_parm_long)) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
+ }
+
+ /*
+ * The kernel should never run into this case and we
+ * have no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+
+ switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr)) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ fallthrough;
+ case USER_FAULT:
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page)) {
+ mmap_read_unlock(mm);
+ break;
+ }
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ page = phys_to_page(addr);
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ if (rc)
+ BUG();
+ break;
+ default:
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ if (get_fault_type(regs) != GMAP_FAULT) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ /*
+ * If the VM has been rebooted, its address space might still contain
+ * secure pages from the previous boot.
+ * Clear the page so it can be reused.
+ */
+ if (!gmap_destroy_page(gmap, gaddr))
+ return;
+ /*
+ * Either KVM messed up the secure guest mapping or the same
+ * page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is running
+ * and can therefore never occur in kernel context.
+ */
+ printk_ratelimited(KERN_WARNING
+ "Secure storage violation in task: %s, pid %d\n",
+ current->comm, current->pid);
+ send_sig(SIGSEGV, current, 0);
+}
+
+#endif /* CONFIG_PGSTE */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index edcdca97e85e..02d15c8dc92e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2,7 +2,7 @@
/*
* KVM guest address space mapping code
*
- * Copyright IBM Corp. 2007, 2016, 2018
+ * Copyright IBM Corp. 2007, 2020
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
* David Hildenbrand <david@redhat.com>
* Janosch Frank <frankja@linux.vnet.ibm.com>
@@ -17,8 +17,8 @@
#include <linux/swapops.h>
#include <linux/ksm.h>
#include <linux/mman.h>
+#include <linux/pgtable.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
@@ -27,7 +27,6 @@
/**
* gmap_alloc - allocate and initialize a guest address space
- * @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
@@ -56,19 +55,19 @@ static struct gmap *gmap_alloc(unsigned long limit)
atype = _ASCE_TYPE_REGION1;
etype = _REGION1_ENTRY_EMPTY;
}
- gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
+ gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
INIT_LIST_HEAD(&gmap->pt_list);
- INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
- INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
- INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&gmap->guest_table_lock);
spin_lock_init(&gmap->shadow_lock);
refcount_set(&gmap->ref_count, 1);
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
goto out_free;
page->index = 0;
@@ -300,7 +299,7 @@ struct gmap *gmap_get_enabled(void)
EXPORT_SYMBOL_GPL(gmap_get_enabled);
/*
- * gmap_alloc_table is assumed to be called with mmap_sem held
+ * gmap_alloc_table is assumed to be called with mmap_lock held
*/
static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long init, unsigned long gaddr)
@@ -309,7 +308,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
unsigned long *new;
/* since we dont free the gmap table until gmap_free we can unlock */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
@@ -405,10 +404,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE)
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
return 0;
@@ -438,7 +437,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
return -EINVAL;
flush = 0;
- down_write(&gmap->mm->mmap_sem);
+ mmap_write_lock(gmap->mm);
for (off = 0; off < len; off += PMD_SIZE) {
/* Remove old translation */
flush |= __gmap_unmap_by_gaddr(gmap, to + off);
@@ -448,7 +447,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
(void *) from + off))
break;
}
- up_write(&gmap->mm->mmap_sem);
+ mmap_write_unlock(gmap->mm);
if (flush)
gmap_flush_tlb(gmap);
if (off >= len)
@@ -466,7 +465,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* Returns user space address which corresponds to the guest address or
* -EFAULT if no such mapping exists.
* This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*
* Note: Can also be called for shadow gmaps.
@@ -495,16 +494,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
unsigned long rc;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = __gmap_translate(gmap, gaddr);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
- * @gmap: pointer to guest mapping meta data structure
+ * @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
* @vmaddr: vm address associated with the host page table
*/
@@ -527,14 +526,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
unsigned long gaddr);
/**
- * gmap_link - set up shadow page tables to connect a host to a guest address
+ * __gmap_link - set up shadow page tables to connect a host to a guest address
* @gmap: pointer to guest mapping meta data structure
* @gaddr: guest address
* @vmaddr: vm address
*
* Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
* if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
* when this function gets called.
*/
int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
@@ -594,7 +593,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
return -EFAULT;
/* Link gmap segment table entry location to page table. */
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
return rc;
ptl = pmd_lock(mm, pmd);
@@ -640,7 +639,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
int rc;
bool unlocked;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
retry:
unlocked = false;
@@ -649,13 +648,13 @@ retry:
rc = vmaddr;
goto out_up;
}
- if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+ if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
&unlocked)) {
rc = -EFAULT;
goto out_up;
}
/*
- * In the case that fixup_user_fault unlocked the mmap_sem during
+ * In the case that fixup_user_fault unlocked the mmap_lock during
* faultin redo __gmap_translate to not race with a map/unmap_segment.
*/
if (unlocked)
@@ -663,16 +662,17 @@ retry:
rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
/*
- * this function is assumed to be called with mmap_sem held
+ * this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
+ struct vm_area_struct *vma;
unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
@@ -682,11 +682,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
+
+ vma = vma_lookup(gmap->mm, vmaddr);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return;
+
/* Get pointer to the page table entry */
ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep))
+ if (likely(ptep)) {
ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(ptep, ptl);
+ }
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
@@ -696,7 +702,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
unsigned long gaddr, vmaddr, size;
struct vm_area_struct *vma;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
for (gaddr = from; gaddr < to;
gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
/* Find the vm address for the guest address */
@@ -719,7 +725,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
zap_page_range(vma, vmaddr, size);
}
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
}
EXPORT_SYMBOL_GPL(gmap_discard);
@@ -787,16 +793,20 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
static inline unsigned long *gmap_table_walk(struct gmap *gmap,
unsigned long gaddr, int level)
{
- unsigned long *table;
+ const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
+ unsigned long *table = gmap->table;
- if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
- return NULL;
if (gmap_is_shadow(gmap) && gmap->removed)
return NULL;
- if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+
+ if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
return NULL;
- table = gmap->table;
- switch (gmap->asce & _ASCE_TYPE_MASK) {
+
+ if (asce_type != _ASCE_TYPE_REGION1 &&
+ gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
+ return NULL;
+
+ switch (asce_type) {
case _ASCE_TYPE_REGION1:
table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
if (level == 4)
@@ -804,7 +814,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
if (level == 3)
@@ -812,7 +822,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
if (level == 2)
@@ -820,7 +830,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
if (*table & _REGION_ENTRY_INVALID)
return NULL;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* Fallthrough */
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (level == 1)
@@ -875,10 +885,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
BUG_ON(gmap_is_shadow(gmap));
fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
- if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
return -EFAULT;
if (unlocked)
- /* lost mmap_sem, caller has to retry __gmap_translate */
+ /* lost mmap_lock, caller has to retry __gmap_translate */
return 0;
/* Connect the page tables */
return __gmap_link(gmap, gaddr, vmaddr);
@@ -949,7 +959,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
* -EAGAIN if a fixup is needed
* -EINVAL if unsupported notifier bits have been specified
*
- * Expected to be called with sg->mm->mmap_sem in read and
+ * Expected to be called with sg->mm->mmap_lock in read and
* guest_table_lock held.
*/
static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
@@ -964,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
return -EAGAIN;
if (prot == PROT_NONE && !pmd_i) {
- pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (prot == PROT_READ && !pmd_p) {
- pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
- pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (bits & GMAP_NOTIFY_MPROT)
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
/* Shadow GMAP protection needs split PMDs */
if (bits & GMAP_NOTIFY_SHADOW)
@@ -995,7 +1005,7 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EAGAIN if a fixup is needed.
*
- * Expected to be called with sg->mm->mmap_sem in read
+ * Expected to be called with sg->mm->mmap_lock in read
*/
static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
pmd_t *pmdp, int prot, unsigned long bits)
@@ -1031,7 +1041,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* Returns 0 if successfully protected, -ENOMEM if out of memory and
* -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
unsigned long len, int prot, unsigned long bits)
@@ -1102,9 +1112,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
return -EINVAL;
if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
- down_read(&gmap->mm->mmap_sem);
+ mmap_read_lock(gmap->mm);
rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- up_read(&gmap->mm->mmap_sem);
+ mmap_read_unlock(gmap->mm);
return rc;
}
EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
@@ -1120,7 +1130,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
* if reading using the virtual address failed. -EINVAL if called on a gmap
* shadow.
*
- * Called with gmap->mm->mmap_sem in read.
+ * Called with gmap->mm->mmap_lock in read.
*/
int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
{
@@ -1141,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
address = pte_val(pte) & PAGE_MASK;
address += gaddr & ~PAGE_MASK;
*val = *(unsigned long *) address;
- pte_val(*ptep) |= _PAGE_YOUNG;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
}
@@ -1173,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table);
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
struct gmap_rmap *rmap)
{
+ struct gmap_rmap *temp;
void __rcu **slot;
BUG_ON(!gmap_is_shadow(sg));
@@ -1180,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
+ for (temp = rmap->next; temp; temp = temp->next) {
+ if (temp->raddr == rmap->raddr) {
+ kfree(rmap);
+ return;
+ }
+ }
radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
@@ -1214,11 +1231,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
vmaddr = __gmap_translate(parent, paddr);
if (IS_ERR_VALUE(vmaddr))
return vmaddr;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = raddr;
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc) {
kfree(rmap);
return rc;
@@ -1268,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
asm volatile(
- " .insn rrf,0xb98e0000,%0,%1,0,0"
+ " idte %0,0,%1"
: : "a" (asce), "a" (vaddr) : "cc", "memory");
}
@@ -1692,11 +1709,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
}
spin_unlock(&parent->shadow_lock);
/* protect after insertion, so it will get properly invalidated */
- down_read(&parent->mm->mmap_sem);
+ mmap_read_lock(parent->mm);
rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
PROT_READ, GMAP_NOTIFY_SHADOW);
- up_read(&parent->mm->mmap_sem);
+ mmap_read_unlock(parent->mm);
spin_lock(&parent->shadow_lock);
new->initialized = true;
if (rc) {
@@ -1725,7 +1742,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
int fake)
@@ -1737,7 +1754,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r2t & _REGION_ENTRY_ORIGIN;
@@ -1809,7 +1826,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
int fake)
@@ -1821,7 +1838,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
BUG_ON(!gmap_is_shadow(sg));
/* Allocate a shadow region second table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = r3t & _REGION_ENTRY_ORIGIN;
@@ -1840,6 +1857,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
goto out_free;
} else if (*table & _REGION_ENTRY_ORIGIN) {
rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
}
crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
/* mark as invalid as long as the parent table is not protected */
@@ -1892,7 +1910,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
int fake)
@@ -1904,7 +1922,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
/* Allocate a shadow segment table */
- page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
page->index = sgt & _REGION_ENTRY_ORIGIN;
@@ -1966,7 +1984,7 @@ out_free:
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
/**
- * gmap_shadow_lookup_pgtable - find a shadow page table
+ * gmap_shadow_pgt_lookup - find a shadow page table
* @sg: pointer to the shadow guest address space structure
* @saddr: the address in the shadow aguest address space
* @pgt: parent gmap address of the page table to get shadowed
@@ -1976,7 +1994,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
* Returns 0 if the shadow page table was found and -EAGAIN if the page
* table was not found.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
unsigned long *pgt, int *dat_protection,
@@ -2016,7 +2034,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
* shadow table structure is incomplete, -ENOMEM if out of memory,
* -EFAULT if an address in the parent gmap could not be resolved and
*
- * Called with gmap->mm->mmap_sem in read
+ * Called with gmap->mm->mmap_lock in read
*/
int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
int fake)
@@ -2095,7 +2113,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
* shadow table structure is incomplete, -ENOMEM if out of memory and
* -EFAULT if an address in the parent gmap could not be resolved.
*
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
*/
int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
{
@@ -2111,7 +2129,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
parent = sg->parent;
prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
- rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
if (!rmap)
return -ENOMEM;
rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
@@ -2123,7 +2141,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
rc = vmaddr;
break;
}
- rc = radix_tree_preload(GFP_KERNEL);
+ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
if (rc)
break;
rc = -EAGAIN;
@@ -2160,7 +2178,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
}
EXPORT_SYMBOL_GPL(gmap_shadow_page);
-/**
+/*
* gmap_shadow_notify - handle notifications for shadow gmap
*
* Called with sg->parent->shadow_lock.
@@ -2220,7 +2238,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
- * @addr: virtual address in the process address space
+ * @vmaddr: virtual address in the process address space
* @pte: pointer to the page table entry
* @bits: bits from the pgste that caused the notify call
*
@@ -2264,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify);
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
unsigned long gaddr)
{
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}
@@ -2283,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
{
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
- pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
@@ -2291,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
}
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
@@ -2313,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
_SEGMENT_ENTRY_GMAP_UC));
if (purge)
__pmdp_csp(pmdp);
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2436,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
return false;
/* Clear UC indication and reset protection */
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
return true;
}
@@ -2480,23 +2498,37 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
}
EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ split_huge_pmd(vma, pmd, addr);
+ return 0;
+}
+
+static const struct mm_walk_ops thp_split_walk_ops = {
+ .pmd_entry = thp_split_walk_pmd_entry,
+};
+
static inline void thp_split_mm(struct mm_struct *mm)
{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct vm_area_struct *vma;
- unsigned long addr;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
- for (addr = vma->vm_start;
- addr < vma->vm_end;
- addr += PAGE_SIZE)
- follow_page(vma, addr, FOLL_SPLIT);
+ for_each_vma(vmi, vma) {
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
+ walk_page_vma(vma, &thp_split_walk_ops, NULL);
}
mm->def_flags |= VM_NOHUGEPAGE;
-#endif
}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
* Remove all empty zero pages from the mapping for lazy refaulting
@@ -2538,16 +2570,34 @@ int s390_enable_sie(void)
/* Fail if the page tables are 2K */
if (!mm_alloc_pgste(mm))
return -EINVAL;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
thp_split_mm(mm);
walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
+int gmap_mark_unmergeable(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int ret;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ for_each_vma(vmi, vma) {
+ ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+ MADV_UNMERGEABLE, &vma->vm_flags);
+ if (ret)
+ return ret;
+ }
+ mm->def_flags &= ~VM_MERGEABLE;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
+
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2560,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
return 0;
}
+/*
+ * Give a chance to schedule after setting a key to 256 pages.
+ * We only hold the mm lock, which is a rwsem and the kvm srcu.
+ * Both can sleep.
+ */
+static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ cond_resched();
+ return 0;
+}
+
static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
unsigned long hmask, unsigned long next,
struct mm_walk *walk)
@@ -2582,39 +2644,35 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
end = start + HPAGE_SIZE - 1;
__storage_key_init_range(start, end);
set_bit(PG_arch_1, &page->flags);
+ cond_resched();
return 0;
}
static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
+ .pmd_entry = __s390_enable_skey_pmd,
};
int s390_enable_skey(void)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
int rc = 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
if (mm_uses_skeys(mm))
goto out_up;
mm->context.uses_skeys = 1;
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags)) {
- mm->context.uses_skeys = 0;
- rc = -ENOMEM;
- goto out_up;
- }
+ rc = gmap_mark_unmergeable();
+ if (rc) {
+ mm->context.uses_skeys = 0;
+ goto out_up;
}
- mm->def_flags &= ~VM_MERGEABLE;
-
walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
out_up:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return rc;
}
EXPORT_SYMBOL_GPL(s390_enable_skey);
@@ -2635,8 +2693,174 @@ static const struct mm_walk_ops reset_cmma_walk_ops = {
void s390_reset_cmma(struct mm_struct *mm)
{
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+#define GATHER_GET_PAGES 32
+
+struct reset_walk_state {
+ unsigned long next;
+ unsigned long count;
+ unsigned long pfns[GATHER_GET_PAGES];
+};
+
+static int s390_gather_pages(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct reset_walk_state *p = walk->private;
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ /* we have a reference from the mapping, take an extra one */
+ get_page(phys_to_page(pte_val(pte)));
+ p->pfns[p->count] = phys_to_pfn(pte_val(pte));
+ p->next = next;
+ p->count++;
+ }
+ return p->count >= GATHER_GET_PAGES;
+}
+
+static const struct mm_walk_ops gather_pages_ops = {
+ .pte_entry = s390_gather_pages,
+};
+
+/*
+ * Call the Destroy secure page UVC on each page in the given array of PFNs.
+ * Each page needs to have an extra reference, which will be released here.
+ */
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
+{
+ unsigned long i;
+
+ for (i = 0; i < count; i++) {
+ /* we always have an extra reference */
+ uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ /* get rid of the extra reference */
+ put_page(pfn_to_page(pfns[i]));
+ cond_resched();
+ }
+}
+EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
+
+/**
+ * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
+ * in the given range of the given address space.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ * @interruptible: if not 0, stop when a fatal signal is received
+ *
+ * Walk the given range of the given address space and call the destroy
+ * secure page UVC on each page. Optionally exit early if a fatal signal is
+ * pending.
+ *
+ * Return: 0 on success, -EINTR if the function stopped before completing
+ */
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible)
+{
+ struct reset_walk_state state = { .next = start };
+ int r = 1;
+
+ while (r > 0) {
+ state.count = 0;
+ mmap_read_lock(mm);
+ r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
+ mmap_read_unlock(mm);
+ cond_resched();
+ s390_uv_destroy_pfns(state.count, state.pfns);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
+
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are now not needed anymore.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+ struct page *old;
+
+ old = virt_to_page(gmap->table);
+ spin_lock(&gmap->guest_table_lock);
+ list_del(&old->lru);
+ /*
+ * Sometimes the topmost page might need to be "removed" multiple
+ * times, for example if the VM is rebooted into secure mode several
+ * times concurrently, or if s390_replace_asce fails after calling
+ * s390_remove_old_asce and is attempted again later. In that case
+ * the old asce has been removed from the list, and therefore it
+ * will not be freed when the VM terminates, but the ASCE is still
+ * in use and still pointed to.
+ * A subsequent call to replace_asce will follow the pointer and try
+ * to remove the same page from the list again.
+ * Therefore it's necessary that the page of the ASCE has valid
+ * pointers, so list_del can work (and do nothing) without
+ * dereferencing stale or invalid pointers.
+ */
+ INIT_LIST_HEAD(&old->lru);
+ spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+ unsigned long asce;
+ struct page *page;
+ void *table;
+
+ s390_unlist_old_asce(gmap);
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ table = page_to_virt(page);
+ memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
+ /*
+ * The caller has to deal with the old ASCE, but here we make sure
+ * the new one is properly added to the CRST list, so that
+ * it will be freed when the VM is torn down.
+ */
+ spin_lock(&gmap->guest_table_lock);
+ list_add(&page->lru, &gmap->crst_list);
+ spin_unlock(&gmap->guest_table_lock);
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+ WRITE_ONCE(gmap->asce, asce);
+ WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+ WRITE_ONCE(gmap->table, table);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(s390_replace_asce);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 5674710a4841..c299a18273ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -9,6 +9,7 @@
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
@@ -72,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
+ unsigned long pteval;
int present;
- pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
present = pud_present(__pud(rste));
@@ -101,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste)
* u unused, l large
*/
if (present) {
- pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
- pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
- _PAGE_READ);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
- _PAGE_WRITE);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
- _PAGE_INVALID);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
- _PAGE_PROTECT);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
- _PAGE_DIRTY);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
- _PAGE_YOUNG);
+ pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ pteval |= _PAGE_LARGE | _PAGE_PRESENT;
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
- _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC,
- _PAGE_NOEXEC);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
} else
- pte_val(pte) = _PAGE_INVALID;
- return pte;
+ pteval = _PAGE_INVALID;
+ return __pte(pteval);
}
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
@@ -159,12 +152,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste &= ~_SEGMENT_ENTRY_NOEXEC;
/* Set correct table type for 2G hugepages */
- if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
- rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
- else
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
+ if (likely(pte_present(pte)))
+ rste |= _REGION3_ENTRY_LARGE;
+ rste |= _REGION_ENTRY_TYPE_R3;
+ } else if (likely(pte_present(pte)))
rste |= _SEGMENT_ENTRY_LARGE;
+
clear_huge_pte_skeys(mm, rste);
- pte_val(*ptep) = rste;
+ set_pte(ptep, __pte(rste));
}
pte_t huge_ptep_get(pte_t *ptep)
@@ -186,7 +182,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
pgd_t *pgdp;
@@ -241,35 +237,15 @@ int pud_huge(pud_t pud)
return pud_large(pud);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- if (flags & FOLL_GET)
- return NULL;
-
- return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-}
-
-static __init int setup_hugepagesz(char *opt)
-{
- unsigned long size;
- char *string = opt;
-
- size = memparse(opt, &opt);
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- pr_err("hugepagesz= specifies an unsupported page size %s\n",
- string);
- return 0;
- }
- return 1;
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ return true;
+ else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
@@ -326,7 +302,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- int rc;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -353,15 +328,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
else
addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
- return addr;
+ return check_asce_limit(mm, addr, len);
}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..97d66a3e60fb 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -33,10 +33,11 @@
#include <linux/dma-direct.h>
#include <asm/processor.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/kfence.h>
+#include <asm/ptdump.h>
#include <asm/dma.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -46,15 +47,18 @@
#include <asm/kasan.h>
#include <asm/dma-mapping.h>
#include <asm/uv.h>
+#include <linux/virtio_anchor.h>
+#include <linux/virtio_config.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
+static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
+
+unsigned long s390_invalid_asce;
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(zero_page_mask);
-bool initmem_freed;
-
static void __init setup_zero_pages(void)
{
unsigned int order;
@@ -91,6 +95,9 @@ void __init paging_init(void)
unsigned long pgd_type, asce_bits;
psw_t psw;
+ s390_invalid_asce = (unsigned long)invalid_pg_dir;
+ s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY);
init_mm.pgd = swapper_pg_dir;
if (VMALLOC_END > _REGION2_SIZE) {
asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
@@ -101,14 +108,14 @@ void __init paging_init(void)
}
init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
S390_lowcore.kernel_asce = init_mm.context.asce;
- S390_lowcore.user_asce = S390_lowcore.kernel_asce;
+ S390_lowcore.user_asce = s390_invalid_asce;
crst_table_init((unsigned long *) init_mm.pgd, pgd_type);
vmem_map_init();
- kasan_copy_shadow(init_mm.pgd);
+ kasan_copy_shadow_mapping();
/* enable virtual mapping in kernel mode */
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
- __ctl_load(S390_lowcore.kernel_asce, 7, 7);
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
__ctl_load(S390_lowcore.kernel_asce, 13, 13);
psw.mask = __extract_psw();
psw_bits(psw).dat = 1;
@@ -116,13 +123,12 @@ void __init paging_init(void)
__load_psw_mask(psw.mask);
kasan_free_early_identity();
- sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_dma_bits = 31;
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS);
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
void mark_rodata_ro(void)
@@ -131,6 +137,7 @@ void mark_rodata_ro(void)
set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
+ debug_checkwx();
}
int set_memory_encrypted(unsigned long addr, int numpages)
@@ -168,10 +175,11 @@ static void pv_init(void)
if (!is_prot_virt_guest())
return;
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+
/* make sure bounce buffers are shared */
- swiotlb_init(1);
+ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
- swiotlb_force = SWIOTLB_FORCE;
}
void __init mem_init(void)
@@ -183,7 +191,7 @@ void __init mem_init(void)
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
pv_init();
-
+ kfence_split_mapping();
/* Setup guest page hinting */
cmma_init();
@@ -192,16 +200,16 @@ void __init mem_init(void)
setup_zero_pages(); /* Setup zeroed pages. */
cmma_init_nodat();
-
- mem_init_print_info(NULL);
}
void free_initmem(void)
{
- initmem_freed = true;
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RW | SET_MEMORY_NX);
+ free_reserved_area(sclp_early_sccb,
+ sclp_early_sccb + EXT_SCCB_READ_SCP,
+ POISON_FREE_INITMEM, "unused early sccb");
free_initmem_default(POISON_FREE_INITMEM);
}
@@ -268,27 +276,30 @@ device_initcall(s390_cma_mem_init);
#endif /* CONFIG_CMA */
int arch_add_memory(int nid, u64 start, u64 size,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(restrictions->altmap))
+ if (WARN_ON_ONCE(params->altmap))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
return -EINVAL;
+ VM_BUG_ON(!mhp_range_allowed(start, size, true));
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
- rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+ rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
index 06345616a646..9f988d4582ed 100644
--- a/arch/s390/mm/kasan_init.c
+++ b/arch/s390/mm/kasan_init.c
@@ -2,8 +2,8 @@
#include <linux/kasan.h>
#include <linux/sched/task.h>
#include <linux/memblock.h>
+#include <linux/pgtable.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/kasan.h>
#include <asm/mem_detect.h>
#include <asm/processor.h>
@@ -11,6 +11,7 @@
#include <asm/facility.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/uv.h>
static unsigned long segment_pos __initdata;
static unsigned long segment_low __initdata;
@@ -85,7 +86,7 @@ enum populate_mode {
POPULATE_ZERO_SHADOW,
POPULATE_SHALLOW
};
-static void __init kasan_early_vmemmap_populate(unsigned long address,
+static void __init kasan_early_pgtable_populate(unsigned long address,
unsigned long end,
enum populate_mode mode)
{
@@ -99,9 +100,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO);
if (!has_nx)
pgt_prot_zero &= ~_PAGE_NOEXEC;
- pgt_prot = pgprot_val(PAGE_KERNEL_EXEC);
- sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC);
+ pgt_prot = pgprot_val(PAGE_KERNEL);
+ sgt_prot = pgprot_val(SEGMENT_KERNEL);
+ if (!has_nx || mode == POPULATE_ONE2ONE) {
+ pgt_prot &= ~_PAGE_NOEXEC;
+ sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
+ }
+ /*
+ * The first 1MB of 1:1 mapping is mapped with 4KB pages
+ */
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -117,8 +125,7 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pgd_populate(&init_mm, pg_dir, p4_dir);
}
- if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
- mode == POPULATE_SHALLOW) {
+ if (mode == POPULATE_SHALLOW) {
address = (address + P4D_SIZE) & P4D_MASK;
continue;
}
@@ -137,12 +144,6 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
p4d_populate(&init_mm, p4_dir, pu_dir);
}
- if (!IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) &&
- mode == POPULATE_SHALLOW) {
- address = (address + PUD_SIZE) & PUD_MASK;
- continue;
- }
-
pu_dir = pud_offset(p4_dir, address);
if (pud_none(*pu_dir)) {
if (mode == POPULATE_ZERO_SHADOW &&
@@ -159,30 +160,26 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
pm_dir = pmd_offset(pu_dir, address);
if (pmd_none(*pm_dir)) {
- if (mode == POPULATE_ZERO_SHADOW &&
- IS_ALIGNED(address, PMD_SIZE) &&
+ if (IS_ALIGNED(address, PMD_SIZE) &&
end - address >= PMD_SIZE) {
- pmd_populate(&init_mm, pm_dir,
- kasan_early_shadow_pte);
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
- }
- /* the first megabyte of 1:1 is mapped with 4k pages */
- if (has_edat && address && end - address >= PMD_SIZE &&
- mode != POPULATE_ZERO_SHADOW) {
- void *page;
-
- if (mode == POPULATE_ONE2ONE) {
- page = (void *)address;
- } else {
- page = kasan_early_alloc_segment();
- memset(page, 0, _SEGMENT_SIZE);
+ if (mode == POPULATE_ZERO_SHADOW) {
+ pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
+ } else if (has_edat && address) {
+ void *page;
+
+ if (mode == POPULATE_ONE2ONE) {
+ page = (void *)address;
+ } else {
+ page = kasan_early_alloc_segment();
+ memset(page, 0, _SEGMENT_SIZE);
+ }
+ set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot));
+ address = (address + PMD_SIZE) & PMD_MASK;
+ continue;
}
- pmd_val(*pm_dir) = __pa(page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
- continue;
}
-
pt_dir = kasan_early_pte_alloc();
pmd_populate(&init_mm, pm_dir, pt_dir);
} else if (pmd_large(*pm_dir)) {
@@ -197,16 +194,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address,
switch (mode) {
case POPULATE_ONE2ONE:
page = (void *)address;
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_MAP:
page = kasan_early_alloc_pages(0);
memset(page, 0, PAGE_SIZE);
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_ZERO_SHADOW:
page = kasan_early_shadow_page;
- pte_val(*pt_dir) = __pa(page) | pgt_prot_zero;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero));
break;
case POPULATE_SHALLOW:
/* should never happen */
@@ -254,12 +251,9 @@ static void __init kasan_early_detect_facilities(void)
void __init kasan_early_init(void)
{
- unsigned long untracked_mem_end;
unsigned long shadow_alloc_size;
unsigned long initrd_end;
- unsigned long asce_type;
unsigned long memsize;
- unsigned long vmax;
unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO);
pte_t pte_z;
pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY);
@@ -274,30 +268,23 @@ void __init kasan_early_init(void)
memsize = get_mem_detect_end();
if (!memsize)
kasan_early_panic("cannot detect physical memory size\n");
- /* respect mem= cmdline parameter */
- if (memory_end_set && memsize > memory_end)
- memsize = memory_end;
- if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE)
- memsize = min(memsize, OLDMEM_SIZE);
- memsize = min(memsize, KASAN_SHADOW_START);
-
- if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) {
- /* 4 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION2_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION1_SIZE;
- asce_type = _ASCE_TYPE_REGION2;
- } else {
- /* 3 level paging */
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE));
- BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE));
- crst_table_init((unsigned long *)early_pg_dir,
- _REGION3_ENTRY_EMPTY);
- untracked_mem_end = vmax = _REGION2_SIZE;
- asce_type = _ASCE_TYPE_REGION3;
- }
+ /*
+ * Kasan currently supports standby memory but only if it follows
+ * online memory (default allocation), i.e. no memory holes.
+ * - memsize represents end of online memory
+ * - ident_map_size represents online + standby and memory limits
+ * accounted.
+ * Kasan maps "memsize" right away.
+ * [0, memsize] - as identity mapping
+ * [__sha(0), __sha(memsize)] - shadow memory for identity mapping
+ * The rest [memsize, ident_map_size] if memsize < ident_map_size
+ * could be mapped/unmapped dynamically later during memory hotplug.
+ */
+ memsize = min(memsize, ident_map_size);
+
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE));
+ BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE));
+ crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY);
/* init kasan zero shadow */
crst_table_init((unsigned long *)kasan_early_shadow_p4d,
@@ -312,7 +299,7 @@ void __init kasan_early_init(void)
pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) {
initrd_end =
- round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE);
+ round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE);
pgalloc_low = max(pgalloc_low, initrd_end);
}
@@ -363,24 +350,25 @@ void __init kasan_early_init(void)
* +-----------------+ +- shadow end ---+
*/
/* populate kasan shadow (for identity mapping and zero page mapping) */
- kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP);
- if (IS_ENABLED(CONFIG_MODULES))
- untracked_mem_end = vmax - MODULES_LEN;
+ kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP);
if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) {
- untracked_mem_end = vmax - vmalloc_size - MODULES_LEN;
/* shallowly populate kasan shadow for vmalloc and modules */
- kasan_early_vmemmap_populate(__sha(untracked_mem_end),
- __sha(vmax), POPULATE_SHALLOW);
+ kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END),
+ POPULATE_SHALLOW);
}
/* populate kasan shadow for untracked memory */
- kasan_early_vmemmap_populate(__sha(max_physmem_end),
- __sha(untracked_mem_end),
+ kasan_early_pgtable_populate(__sha(ident_map_size),
+ IS_ENABLED(CONFIG_KASAN_VMALLOC) ?
+ __sha(VMALLOC_START) :
+ __sha(MODULES_VADDR),
+ POPULATE_ZERO_SHADOW);
+ kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE),
POPULATE_ZERO_SHADOW);
/* memory allocated for identity mapping structs will be freed later */
pgalloc_freeable = pgalloc_pos;
/* populate identity mapping */
- kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE);
- kasan_set_pgd(early_pg_dir, asce_type);
+ kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE);
+ kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2);
kasan_enable_dat();
/* enable kasan */
init_task.kasan_depth = 0;
@@ -388,7 +376,7 @@ void __init kasan_early_init(void)
sclp_early_printk("KernelAddressSanitizer initialized\n");
}
-void __init kasan_copy_shadow(pgd_t *pg_dir)
+void __init kasan_copy_shadow_mapping(void)
{
/*
* At this point we are still running on early pages setup early_pg_dir,
@@ -400,27 +388,16 @@ void __init kasan_copy_shadow(pgd_t *pg_dir)
pgd_t *pg_dir_dst;
p4d_t *p4_dir_src;
p4d_t *p4_dir_dst;
- pud_t *pu_dir_src;
- pud_t *pu_dir_dst;
pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START);
- pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START);
+ pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START);
p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START);
p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START);
- if (!p4d_folded(*p4_dir_src)) {
- /* 4 level paging */
- memcpy(p4_dir_dst, p4_dir_src,
- (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
- return;
- }
- /* 3 level paging */
- pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START);
- pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START);
- memcpy(pu_dir_dst, pu_dir_src,
- (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t));
+ memcpy(p4_dir_dst, p4_dir_src,
+ (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t));
}
void __init kasan_free_early_identity(void)
{
- memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
+ memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos);
}
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index de7ca4b6718f..1571cdcb0c50 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -4,8 +4,6 @@
*
* Copyright IBM Corp. 2009, 2015
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/uaccess.h>
@@ -14,9 +12,17 @@
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/asm-extable.h>
#include <asm/ctl_reg.h>
#include <asm/io.h>
+#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/maccess.h>
+
+unsigned long __bootdata_preserved(__memcpy_real_area);
+static __ro_after_init pte_t *memcpy_real_ptep;
+static DEFINE_MUTEX(memcpy_real_mutex);
static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
{
@@ -55,155 +61,94 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz
*/
static DEFINE_SPINLOCK(s390_kernel_write_lock);
-void notrace s390_kernel_write(void *dst, const void *src, size_t size)
+notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
{
+ void *tmp = dst;
unsigned long flags;
long copied;
spin_lock_irqsave(&s390_kernel_write_lock, flags);
- while (size) {
- copied = s390_kernel_write_odd(dst, src, size);
- dst += copied;
- src += copied;
- size -= copied;
+ if (!(flags & PSW_MASK_DAT)) {
+ memcpy(dst, src, size);
+ } else {
+ while (size) {
+ copied = s390_kernel_write_odd(tmp, src, size);
+ tmp += copied;
+ src += copied;
+ size -= copied;
+ }
}
spin_unlock_irqrestore(&s390_kernel_write_lock, flags);
-}
-
-static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count)
-{
- register unsigned long _dest asm("2") = (unsigned long) dest;
- register unsigned long _len1 asm("3") = (unsigned long) count;
- register unsigned long _src asm("4") = (unsigned long) src;
- register unsigned long _len2 asm("5") = (unsigned long) count;
- int rc = -EFAULT;
-
- asm volatile (
- "0: mvcle %1,%2,0x0\n"
- "1: jo 0b\n"
- " lhi %0,0x0\n"
- "2:\n"
- EX_TABLE(1b,2b)
- : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
- "+d" (_len2), "=m" (*((long *) dest))
- : "m" (*((long *) src))
- : "cc", "memory");
- return rc;
-}
-static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
- unsigned long src,
- unsigned long count)
-{
- int irqs_disabled, rc;
- unsigned long flags;
-
- if (!count)
- return 0;
- flags = arch_local_irq_save();
- irqs_disabled = arch_irqs_disabled_flags(flags);
- if (!irqs_disabled)
- trace_hardirqs_off();
- __arch_local_irq_stnsm(0xf8); // disable DAT
- rc = __memcpy_real((void *) dest, (void *) src, (size_t) count);
- if (flags & PSW_MASK_DAT)
- __arch_local_irq_stosm(0x04); // enable DAT
- if (!irqs_disabled)
- trace_hardirqs_on();
- __arch_local_irq_ssm(flags);
- return rc;
+ return dst;
}
-/*
- * Copy memory in real mode (kernel to kernel)
- */
-int memcpy_real(void *dest, void *src, size_t count)
+void __init memcpy_real_init(void)
{
- int rc;
-
- if (S390_lowcore.nodat_stack != 0) {
- preempt_disable();
- rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3,
- dest, src, count);
- preempt_enable();
- return rc;
- }
- /*
- * This is a really early memcpy_real call, the stacks are
- * not set up yet. Just call _memcpy_real on the early boot
- * stack
- */
- return _memcpy_real((unsigned long) dest,(unsigned long) src,
- (unsigned long) count);
+ memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true);
+ if (!memcpy_real_ptep)
+ panic("Couldn't setup memcpy real area");
}
-/*
- * Copy memory in absolute mode (kernel to kernel)
- */
-void memcpy_absolute(void *dest, void *src, size_t count)
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long cr0, flags, prefix;
-
- flags = arch_local_irq_save();
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- prefix = store_prefix();
- if (prefix) {
- local_mcck_disable();
- set_prefix(0);
- memcpy(dest, src, count);
- set_prefix(prefix);
- local_mcck_enable();
- } else {
- memcpy(dest, src, count);
+ size_t len, copied, res = 0;
+ unsigned long phys, offset;
+ void *chunk;
+ pte_t pte;
+
+ while (count) {
+ phys = src & PAGE_MASK;
+ offset = src & ~PAGE_MASK;
+ chunk = (void *)(__memcpy_real_area + offset);
+ len = min(count, PAGE_SIZE - offset);
+ pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
+
+ mutex_lock(&memcpy_real_mutex);
+ if (pte_val(pte) != pte_val(*memcpy_real_ptep)) {
+ __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL);
+ set_pte(memcpy_real_ptep, pte);
+ }
+ copied = copy_to_iter(chunk, len, iter);
+ mutex_unlock(&memcpy_real_mutex);
+
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- __ctl_load(cr0, 0, 0);
- arch_local_irq_restore(flags);
+ return res;
}
-/*
- * Copy memory from kernel (real) to user (virtual)
- */
-int copy_to_user_real(void __user *dest, void *src, unsigned long count)
+int memcpy_real(void *dest, unsigned long src, size_t count)
{
- int offs = 0, size, rc;
- char *buf;
-
- buf = (char *) __get_free_page(GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- rc = -EFAULT;
- while (offs < count) {
- size = min(PAGE_SIZE, count - offs);
- if (memcpy_real(buf, src + offs, size))
- goto out;
- if (copy_to_user(dest + offs, buf, size))
- goto out;
- offs += size;
- }
- rc = 0;
-out:
- free_page((unsigned long) buf);
- return rc;
+ struct iov_iter iter;
+ struct kvec kvec;
+
+ kvec.iov_base = dest;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+ if (memcpy_real_iter(&iter, src, count) < count)
+ return -EFAULT;
+ return 0;
}
/*
- * Check if physical address is within prefix or zero page
+ * Find CPU that owns swapped prefix page
*/
-static int is_swapped(unsigned long addr)
+static int get_swapped_owner(phys_addr_t addr)
{
- unsigned long lc;
+ phys_addr_t lc;
int cpu;
- if (addr < sizeof(struct lowcore))
- return 1;
for_each_online_cpu(cpu) {
- lc = (unsigned long) lowcore_ptr[cpu];
+ lc = virt_to_phys(lowcore_ptr[cpu]);
if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
continue;
- return 1;
+ return cpu;
}
- return 0;
+ return -1;
}
/*
@@ -214,27 +159,46 @@ static int is_swapped(unsigned long addr)
*/
void *xlate_dev_mem_ptr(phys_addr_t addr)
{
- void *bounce = (void *) addr;
+ void *ptr = phys_to_virt(addr);
+ void *bounce = ptr;
+ struct lowcore *abs_lc;
+ unsigned long flags;
unsigned long size;
+ int this_cpu, cpu;
- get_online_cpus();
- preempt_disable();
- if (is_swapped(addr)) {
- size = PAGE_SIZE - (addr & ~PAGE_MASK);
- bounce = (void *) __get_free_page(GFP_ATOMIC);
- if (bounce)
- memcpy_absolute(bounce, (void *) addr, size);
+ cpus_read_lock();
+ this_cpu = get_cpu();
+ if (addr >= sizeof(struct lowcore)) {
+ cpu = get_swapped_owner(addr);
+ if (cpu < 0)
+ goto out;
}
- preempt_enable();
- put_online_cpus();
+ bounce = (void *)__get_free_page(GFP_ATOMIC);
+ if (!bounce)
+ goto out;
+ size = PAGE_SIZE - (addr & ~PAGE_MASK);
+ if (addr < sizeof(struct lowcore)) {
+ abs_lc = get_abs_lowcore(&flags);
+ ptr = (void *)abs_lc + addr;
+ memcpy(bounce, ptr, size);
+ put_abs_lowcore(abs_lc, flags);
+ } else if (cpu == this_cpu) {
+ ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu]));
+ memcpy(bounce, ptr, size);
+ } else {
+ memcpy(bounce, ptr, size);
+ }
+out:
+ put_cpu();
+ cpus_read_unlock();
return bounce;
}
/*
* Free converted buffer for /dev/mem access (if necessary)
*/
-void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf)
+void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr)
{
- if ((void *) addr != buf)
- free_page((unsigned long) buf);
+ if (addr != virt_to_phys(ptr))
+ free_page((unsigned long)ptr);
}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index cbc718ba6d78..3327c47bc181 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -17,7 +17,6 @@
#include <linux/random.h>
#include <linux/compat.h>
#include <linux/security.h>
-#include <asm/pgalloc.h>
#include <asm/elf.h>
static unsigned long stack_maxrandom_size(void)
@@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack)
unsigned long arch_mmap_rnd(void)
{
- return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT;
}
static unsigned long mmap_base_legacy(unsigned long rnd)
@@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd,
/*
* Top of mmap area (just below the process stack).
- * Leave at least a ~32 MB hole.
+ * Leave at least a ~128 MB hole.
*/
- gap_min = 32 * 1024 * 1024UL;
+ gap_min = SZ_128M;
gap_max = (STACK_TOP / 6) * 5;
if (gap < gap_min)
@@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd,
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
-unsigned long
-arch_get_unmapped_area(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- int rc;
if (len > TASK_SIZE - mmap_min_addr)
return -ENOMEM;
@@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- int rc;
/* requested length too big for entire address space */
if (len > TASK_SIZE - mmap_min_addr)
@@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
+ if (offset_in_page(addr)) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
- if (addr & ~PAGE_MASK)
+ if (offset_in_page(addr))
return addr;
}
check_asce_limit:
- if (addr + len > current->mm->context.asce_limit &&
- addr + len <= TASK_SIZE) {
- rc = crst_table_upgrade(mm, addr + len);
- if (rc)
- return (unsigned long) rc;
- }
-
- return addr;
+ return check_asce_limit(mm, addr, len);
}
/*
@@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+static const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_RO,
+ [VM_WRITE] = PAGE_RO,
+ [VM_WRITE | VM_READ] = PAGE_RO,
+ [VM_EXEC] = PAGE_RX,
+ [VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_EXEC | VM_WRITE] = PAGE_RX,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_RO,
+ [VM_SHARED | VM_WRITE] = PAGE_RW,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
+ [VM_SHARED | VM_EXEC] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
+};
+DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index fc141893d028..d5ea09d78938 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -14,6 +14,7 @@
#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/init.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
#include <asm/page-states.h>
@@ -31,17 +32,17 @@ __setup("cmma=", cmma);
static inline int cmma_test_essa(void)
{
- register unsigned long tmp asm("0") = 0;
- register int rc asm("1");
+ unsigned long tmp = 0;
+ int rc = -EOPNOTSUPP;
/* test ESSA_GET_STATE */
asm volatile(
- " .insn rrf,0xb9ab0000,%1,%1,%2,0\n"
- "0: la %0,0\n"
+ " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n"
+ "0: la %[rc],0\n"
"1:\n"
EX_TABLE(0b,1b)
- : "=&d" (rc), "+&d" (tmp)
- : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP));
+ : [rc] "+&d" (rc), [tmp] "+&d" (tmp)
+ : [cmd] "i" (ESSA_GET_STATE));
return rc;
}
@@ -112,7 +113,7 @@ static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end)
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || pmd_large(*pmd))
continue;
- page = virt_to_page(pmd_val(*pmd));
+ page = phys_to_page(pmd_val(*pmd));
set_bit(PG_arch_1, &page->flags);
} while (pmd++, addr = next, addr != end);
}
@@ -130,7 +131,7 @@ static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end)
if (pud_none(*pud) || pud_large(*pud))
continue;
if (!pud_folded(*pud)) {
- page = virt_to_page(pud_val(*pud));
+ page = phys_to_page(pud_val(*pud));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -151,7 +152,7 @@ static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end)
if (p4d_none(*p4d))
continue;
if (!p4d_folded(*p4d)) {
- page = virt_to_page(p4d_val(*p4d));
+ page = phys_to_page(p4d_val(*p4d));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -173,7 +174,7 @@ static void mark_kernel_pgd(void)
if (pgd_none(*pgd))
continue;
if (!pgd_folded(*pgd)) {
- page = virt_to_page(pgd_val(*pgd));
+ page = phys_to_page(pgd_val(*pgd));
for (i = 0; i < 3; i++)
set_bit(PG_arch_1, &page[i].flags);
}
@@ -183,9 +184,9 @@ static void mark_kernel_pgd(void)
void __init cmma_init_nodat(void)
{
- struct memblock_region *reg;
struct page *page;
unsigned long start, end, ix;
+ int i;
if (cmma_flag < 2)
return;
@@ -193,9 +194,7 @@ void __init cmma_init_nodat(void)
mark_kernel_pgd();
/* Set all kernel pages not used for page tables to stable/no-dat */
- for_each_memblock(memory, reg) {
- start = memblock_region_memory_base_pfn(reg);
- end = memblock_region_memory_end_pfn(reg);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
page = pfn_to_page(start);
for (ix = start; ix < end; ix++, page++) {
if (__test_and_clear_bit(PG_arch_1, &page->flags))
@@ -230,46 +229,3 @@ void arch_set_page_dat(struct page *page, int order)
return;
set_page_stable_dat(page, order);
}
-
-void arch_set_page_nodat(struct page *page, int order)
-{
- if (cmma_flag < 2)
- return;
- set_page_stable_nodat(page, order);
-}
-
-int arch_test_page_nodat(struct page *page)
-{
- unsigned char state;
-
- if (cmma_flag < 2)
- return 0;
- state = get_page_state(page);
- return !!(state & 0x20);
-}
-
-void arch_set_page_states(int make_stable)
-{
- unsigned long flags, order, t;
- struct list_head *l;
- struct page *page;
- struct zone *zone;
-
- if (!cmma_flag)
- return;
- if (make_stable)
- drain_local_pages(NULL);
- for_each_populated_zone(zone) {
- spin_lock_irqsave(&zone->lock, flags);
- for_each_migratetype_order(order, t) {
- list_for_each(l, &zone->free_area[order].free_list[t]) {
- page = list_entry(l, struct page, lru);
- if (make_stable)
- set_page_stable_dat(page, order);
- else
- set_page_unused(page, order);
- }
- }
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f8c6faab41f4..85195c18b2e8 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -7,8 +7,8 @@
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/facility.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
+#include <asm/kfence.h>
#include <asm/page.h>
#include <asm/set_memory.h>
@@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m)
static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
unsigned long dtt)
{
- unsigned long table, mask;
+ unsigned long *table, mask;
mask = 0;
if (MACHINE_HAS_EDAT2) {
@@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
break;
}
- table = (unsigned long)old & mask;
+ table = (unsigned long *)((unsigned long)old & mask);
crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
} else if (MACHINE_HAS_IDTE) {
cspg(old, *old, new);
@@ -86,7 +86,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
{
pte_t *ptep, new;
- ptep = pte_offset(pmdp, addr);
+ if (flags == SET_MEMORY_4K)
+ return 0;
+ ptep = pte_offset_kernel(pmdp, addr);
do {
new = *ptep;
if (pte_none(new))
@@ -96,9 +98,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
else if (flags & SET_MEMORY_RW)
new = pte_mkwrite(pte_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pte_val(new) |= _PAGE_NOEXEC;
+ new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC));
else if (flags & SET_MEMORY_X)
- pte_val(new) &= ~_PAGE_NOEXEC;
+ new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
addr += PAGE_SIZE;
@@ -125,11 +127,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
prot &= ~_PAGE_NOEXEC;
ptep = pt_dir;
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_val(*ptep) = pte_addr | prot;
+ set_pte(ptep, __pte(pte_addr | prot));
pte_addr += PAGE_SIZE;
ptep++;
}
- pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY);
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
update_page_count(PG_DIRECT_MAP_1M, -1);
@@ -146,9 +148,9 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pmd_mkwrite(pmd_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -156,6 +158,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pmd_t *pmdp;
int rc = 0;
@@ -165,7 +168,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
return -EINVAL;
next = pmd_addr_end(addr, end);
if (pmd_large(*pmdp)) {
- if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PMD_MASK);
+ need_split |= !!(addr + PMD_SIZE > next);
+ if (need_split) {
rc = split_pmd_page(pmdp, addr);
if (rc)
return rc;
@@ -202,11 +208,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr)
prot &= ~_SEGMENT_ENTRY_NOEXEC;
pmdp = pm_dir;
for (i = 0; i < PTRS_PER_PMD; i++) {
- pmd_val(*pmdp) = pmd_addr | prot;
+ set_pmd(pmdp, __pmd(pmd_addr | prot));
pmd_addr += PMD_SIZE;
pmdp++;
}
- pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ new = __pud(__pa(pm_dir) | _REGION3_ENTRY);
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
update_page_count(PG_DIRECT_MAP_2G, -1);
@@ -223,9 +229,9 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pud_mkwrite(pud_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pud_val(new) |= _REGION_ENTRY_NOEXEC;
+ new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
+ new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -233,6 +239,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long flags)
{
unsigned long next;
+ int need_split;
pud_t *pudp;
int rc = 0;
@@ -242,7 +249,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end,
return -EINVAL;
next = pud_addr_end(addr, end);
if (pud_large(*pudp)) {
- if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ need_split = !!(flags & SET_MEMORY_4K);
+ need_split |= !!(addr & ~PUD_MASK);
+ need_split |= !!(addr + PUD_SIZE > next);
+ if (need_split) {
rc = split_pud_page(pudp, addr);
if (rc)
break;
@@ -279,7 +289,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end,
return rc;
}
-static DEFINE_MUTEX(cpa_mutex);
+DEFINE_MUTEX(cpa_mutex);
static int change_page_attr(unsigned long addr, unsigned long end,
unsigned long flags)
@@ -317,7 +327,7 @@ int __set_memory(unsigned long addr, int numpages, unsigned long flags)
return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags);
}
-#ifdef CONFIG_DEBUG_PAGEALLOC
+#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE)
static void ipte_range(pte_t *pte, unsigned long address, int nr)
{
@@ -337,50 +347,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr)
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long address;
+ pte_t *ptep, pte;
int nr, i, j;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
for (i = 0; i < numpages;) {
- address = page_to_phys(page + i);
- pgd = pgd_offset_k(address);
- p4d = p4d_offset(pgd, address);
- pud = pud_offset(p4d, address);
- pmd = pmd_offset(pud, address);
- pte = pte_offset_kernel(pmd, address);
- nr = (unsigned long)pte >> ilog2(sizeof(long));
+ address = (unsigned long)page_to_virt(page + i);
+ ptep = virt_to_kpte(address);
+ nr = (unsigned long)ptep >> ilog2(sizeof(long));
nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
address += PAGE_SIZE;
- pte++;
+ ptep++;
}
} else {
- ipte_range(pte, address, nr);
+ ipte_range(ptep, address, nr);
}
i += nr;
}
}
-#ifdef CONFIG_HIBERNATION
-bool kernel_page_present(struct page *page)
-{
- unsigned long addr;
- int cc;
-
- addr = page_to_phys(page);
- asm volatile(
- " lra %1,0(%1)\n"
- " ipm %0\n"
- " srl %0,28"
- : "=d" (cc), "+a" (addr) : : "cc");
- return cc == 0;
-}
-#endif /* CONFIG_HIBERNATION */
-
#endif /* CONFIG_DEBUG_PAGEALLOC */
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 3dd253f81a77..2de48b2c1b04 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -53,91 +53,92 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, 2);
+ struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (!page)
return NULL;
- arch_set_page_dat(page, 2);
- return (unsigned long *) page_to_phys(page);
+ arch_set_page_dat(page, CRST_ALLOC_ORDER);
+ return (unsigned long *) page_to_virt(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long) table, 2);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
static void __crst_table_upgrade(void *arg)
{
struct mm_struct *mm = arg;
- if (current->active_mm == mm)
- set_user_asce(mm);
+ /* change all active ASCEs to avoid the creation of new TLBs */
+ if (current->active_mm == mm) {
+ S390_lowcore.user_asce = mm->context.asce;
+ __ctl_load(S390_lowcore.user_asce, 7, 7);
+ }
__tlb_flush_local();
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
- unsigned long *table, *pgd;
- int rc, notify;
+ unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
+ unsigned long asce_limit = mm->context.asce_limit;
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
- VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
- rc = 0;
- notify = 0;
- while (mm->context.asce_limit < end) {
- table = crst_table_alloc(mm);
- if (!table) {
- rc = -ENOMEM;
- break;
- }
- spin_lock_bh(&mm->page_table_lock);
- pgd = (unsigned long *) mm->pgd;
- if (mm->context.asce_limit == _REGION2_SIZE) {
- crst_table_init(table, _REGION2_ENTRY_EMPTY);
- p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = _REGION1_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
- mm_inc_nr_puds(mm);
- } else {
- crst_table_init(table, _REGION1_ENTRY_EMPTY);
- pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
- mm->pgd = (pgd_t *) table;
- mm->context.asce_limit = -PAGE_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
- }
- notify = 1;
- spin_unlock_bh(&mm->page_table_lock);
- }
- if (notify)
- on_each_cpu(__crst_table_upgrade, mm, 0);
- return rc;
-}
+ VM_BUG_ON(asce_limit < _REGION2_SIZE);
-void crst_table_downgrade(struct mm_struct *mm)
-{
- pgd_t *pgd;
+ if (end <= asce_limit)
+ return 0;
- /* downgrade should only happen from 3 to 2 levels (compat only) */
- VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
+ if (asce_limit == _REGION2_SIZE) {
+ p4d = crst_table_alloc(mm);
+ if (unlikely(!p4d))
+ goto err_p4d;
+ crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ }
+ if (end > _REGION1_SIZE) {
+ pgd = crst_table_alloc(mm);
+ if (unlikely(!pgd))
+ goto err_pgd;
+ crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ }
- if (current->active_mm == mm) {
- clear_user_asce();
- __tlb_flush_mm(mm);
+ spin_lock_bh(&mm->page_table_lock);
+
+ /*
+ * This routine gets called with mmap_lock lock held and there is
+ * no reason to optimize for the case of otherwise. However, if
+ * that would ever change, the below check will let us know.
+ */
+ VM_BUG_ON(asce_limit != mm->context.asce_limit);
+
+ if (p4d) {
+ __pgd = (unsigned long *) mm->pgd;
+ p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
+ mm->pgd = (pgd_t *) p4d;
+ mm->context.asce_limit = _REGION1_SIZE;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+ mm_inc_nr_puds(mm);
+ }
+ if (pgd) {
+ __pgd = (unsigned long *) mm->pgd;
+ pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
+ mm->pgd = (pgd_t *) pgd;
+ mm->context.asce_limit = TASK_SIZE_MAX;
+ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+ _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
}
- pgd = mm->pgd;
- mm_dec_nr_pmds(mm);
- mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
- mm->context.asce_limit = _REGION3_SIZE;
- mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
- _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
- crst_table_free(mm, (unsigned long *) pgd);
+ spin_unlock_bh(&mm->page_table_lock);
+
+ on_each_cpu(__crst_table_upgrade, mm, 0);
- if (current->active_mm == mm)
- set_user_asce(mm);
+ return 0;
+
+err_pgd:
+ crst_table_free(mm, p4d);
+err_p4d:
+ return -ENOMEM;
}
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
@@ -160,7 +161,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm)
page = alloc_page(GFP_KERNEL);
if (page) {
- table = (u64 *)page_to_phys(page);
+ table = (u64 *)page_to_virt(page);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
@@ -175,7 +176,75 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */
/*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When 2KB-pgtable is freed from to fully allocated parent page that
+ * page turns partially allocated and added to mm_context_t::pgtable_list.
+ *
+ * If 2KB-pgtable is freed from the partially allocated parent page that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ * PP AA
+ * 01234567 upper byte (bits 24-31) of struct page::_refcount
+ * || ||
+ * || |+--- upper 2KB-pgtable is allocated
+ * || +---- lower 2KB-pgtable is allocated
+ * |+------- upper 2KB-pgtable is pending for removal
+ * +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still unallocated;
+ * - removed from mm_context_t::pgtable_list in case both hales of the
+ * parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still allocated;
+ * - removed from mm_context_t::pgtable_list in case the second half of
+ * the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is safe to release
+ * the page therefore.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor such a page is added to or removed
+ * from mm_context_t::pgtable_list.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
@@ -191,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
mask = atomic_read(&page->_refcount) >> 24;
- mask = (mask | (mask >> 4)) & 3;
- if (mask != 3) {
- table = (unsigned long *) page_to_phys(page);
+ /*
+ * The pending removal bits must also be checked.
+ * Failure to do so might lead to an impossible
+ * value of (i.e 0x13 or 0x23) written to _refcount.
+ * Such values violate the assumption that pending and
+ * allocation bits are mutually exclusive, and the rest
+ * of the code unrails as result. That could lead to
+ * a whole bunch of races and corruptions.
+ */
+ mask = (mask | (mask >> 4)) & 0x03U;
+ if (mask != 0x03U) {
+ table = (unsigned long *) page_to_virt(page);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
atomic_xor_bits(&page->_refcount,
- 1U << (bit + 24));
+ 0x01U << (bit + 24));
list_del(&page->lru);
}
}
@@ -216,15 +294,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
}
arch_set_page_dat(page, 0);
/* Initialize page table */
- table = (unsigned long *) page_to_phys(page);
+ table = (unsigned long *) page_to_virt(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 3 << 24);
+ atomic_xor_bits(&page->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 1 << 24);
+ atomic_xor_bits(&page->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@@ -233,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
return table;
}
+static void page_table_release_check(struct page *page, void *table,
+ unsigned int half, unsigned int mask)
+{
+ char msg[128];
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+ return;
+ snprintf(msg, sizeof(msg),
+ "Invalid pgtable %p release half 0x%02x mask 0x%02x",
+ table, half, mask);
+ dump_page(page, msg);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
+ unsigned int mask, bit, half;
struct page *page;
- unsigned int bit, mask;
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ page = virt_to_page(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
- bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
+ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ /*
+ * Mark the page for delayed release. The actual release
+ * will happen outside of the critical section from this
+ * function or from __tlb_remove_table()
+ */
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- if (mask != 0)
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
+ if (mask != 0x00U)
return;
+ half = 0x01U << bit;
} else {
- atomic_xor_bits(&page->_refcount, 3U << 24);
+ half = 0x03U;
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
}
+ page_table_release_check(page, table, half, mask);
pgtable_pte_page_dtor(page);
__free_page(page);
}
@@ -268,50 +370,57 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
unsigned int bit, mask;
mm = tlb->mm;
- page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
- table = (unsigned long *) (__pa(table) | 3);
+ table = (unsigned long *) ((unsigned long)table | 0x03U);
tlb_remove_table(tlb, table);
return;
}
- bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
+ bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
+ /*
+ * Mark the page for delayed release. The actual release will happen
+ * outside of the critical section from __tlb_remove_table() or from
+ * page_table_free()
+ */
mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- table = (unsigned long *) (__pa(table) | (1U << bit));
+ table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
}
void __tlb_remove_table(void *_table)
{
- unsigned int mask = (unsigned long) _table & 3;
+ unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+ struct page *page = virt_to_page(table);
- switch (mask) {
- case 0: /* pmd, pud, or p4d */
- free_pages((unsigned long) table, 2);
- break;
- case 1: /* lower 2K of a 4K page table */
- case 2: /* higher 2K of a 4K page table */
+ switch (half) {
+ case 0x00U: /* pmd, pud, or p4d */
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ return;
+ case 0x01U: /* lower 2K of a 4K page table */
+ case 0x02U: /* higher 2K of a 4K page table */
mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
mask >>= 24;
- if (mask != 0)
- break;
- /* fallthrough */
- case 3: /* 4K page table with pgstes */
- if (mask & 3)
- atomic_xor_bits(&page->_refcount, 3 << 24);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (mask != 0x00U)
+ return;
+ break;
+ case 0x03U: /* 4K page table with pgstes */
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
break;
}
+
+ page_table_release_check(page, table, half, mask);
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
}
/*
@@ -321,34 +430,34 @@ void __tlb_remove_table(void *_table)
static struct kmem_cache *base_pgt_cache;
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
{
- u64 *table;
+ unsigned long *table;
table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
if (table)
- memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
- return (unsigned long) table;
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ return table;
}
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
{
- kmem_cache_free(base_pgt_cache, (void *) table);
+ kmem_cache_free(base_pgt_cache, table);
}
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
{
- unsigned long table;
+ unsigned long *table;
- table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (table)
- crst_table_init((unsigned long *)table, val);
+ crst_table_init(table, val);
return table;
}
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
{
- free_pages(table, CRST_ALLOC_ORDER);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
@@ -376,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address)
return real;
}
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
unsigned long *pte, next;
if (!alloc)
return 0;
- pte = (unsigned long *) origin;
+ pte = origin;
pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
@@ -392,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *ste, next, table;
+ unsigned long *ste, next, *table;
int rc;
- ste = (unsigned long *) origin;
+ ste = origin;
ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
do {
next = base_segment_addr_end(addr, end);
@@ -408,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
table = base_pgt_alloc();
if (!table)
return -ENOMEM;
- *ste = table | _SEGMENT_ENTRY;
+ *ste = __pa(table) | _SEGMENT_ENTRY;
}
- table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
rc = base_page_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -421,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rtte, next, table;
+ unsigned long *rtte, next, *table;
int rc;
- rtte = (unsigned long *) origin;
+ rtte = origin;
rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
do {
next = base_region3_addr_end(addr, end);
@@ -437,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rtte = table | _REGION3_ENTRY;
+ *rtte = __pa(table) | _REGION3_ENTRY;
}
- table = *rtte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rtte & _REGION_ENTRY_ORIGIN);
rc = base_segment_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -449,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rste, next, table;
+ unsigned long *rste, next, *table;
int rc;
- rste = (unsigned long *) origin;
+ rste = origin;
rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
do {
next = base_region2_addr_end(addr, end);
@@ -465,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rste = table | _REGION2_ENTRY;
+ *rste = __pa(table) | _REGION2_ENTRY;
}
- table = *rste & _REGION_ENTRY_ORIGIN;
+ table = __va(*rste & _REGION_ENTRY_ORIGIN);
rc = base_region3_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -477,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rfte, next, table;
+ unsigned long *rfte, next, *table;
int rc;
- rfte = (unsigned long *) origin;
+ rfte = origin;
rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
do {
next = base_region1_addr_end(addr, end);
@@ -493,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rfte = table | _REGION1_ENTRY;
+ *rfte = __pa(table) | _REGION1_ENTRY;
}
- table = *rfte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rfte & _REGION_ENTRY_ORIGIN);
rc = base_region2_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -514,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
*/
void base_asce_free(unsigned long asce)
{
- unsigned long table = asce & _ASCE_ORIGIN;
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
if (!asce)
return;
@@ -529,7 +638,7 @@ void base_asce_free(unsigned long asce)
base_region2_walk(table, 0, _REGION1_SIZE, 0);
break;
case _ASCE_TYPE_REGION1:
- base_region1_walk(table, 0, -_PAGE_SIZE, 0);
+ base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
break;
}
base_crst_free(table);
@@ -566,7 +675,7 @@ static int base_pgt_cache_init(void)
*/
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
- unsigned long asce, table, end;
+ unsigned long asce, *table, end;
int rc;
if (base_pgt_cache_init())
@@ -577,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
if (!table)
return 0;
rc = base_segment_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION2_SIZE) {
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region3_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION1_SIZE) {
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region2_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
} else {
table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region1_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
}
if (rc) {
base_asce_free(asce);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 9ebd01219812..4909dcd762e8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -19,13 +19,31 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
+pgprot_t pgprot_writecombine(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) | mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writecombine);
+
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ /*
+ * mio_wb_bit_mask may be set on a different CPU, but it is only set
+ * once at init and only read afterwards.
+ */
+ return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, int nodat)
{
@@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pte_val(*ptep) |= _PAGE_INVALID;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
mm->context.flush_mm = 1;
} else
ptep_ipte_global(mm, addr, ptep, nodat);
@@ -206,15 +224,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
*/
- pte_val(entry) |= _PAGE_DIRTY;
- pte_val(entry) &= ~_PAGE_PROTECT;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
if (!(pte_val(entry) & _PAGE_PROTECT))
/* This pte allows write access, set user-dirty */
pgste_val(pgste) |= PGSTE_UC_BIT;
}
#endif
- *ptep = entry;
+ set_pte(ptep, entry);
return pgste;
}
@@ -257,12 +275,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
pgste = pgste_update_all(old, pgste, mm);
if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
_PGSTE_GPS_USAGE_UNUSED)
- pte_val(old) |= _PAGE_UNUSED;
+ old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
}
pgste = pgste_set_pte(ptep, pgste, new);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = new;
+ set_pte(ptep, new);
}
return old;
}
@@ -327,14 +345,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
- pte_val(pte) &= ~_PAGE_NOEXEC;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = pte;
+ set_pte(ptep, pte);
}
preempt_enable();
}
@@ -399,7 +417,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
@@ -411,22 +429,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
}
#ifdef CONFIG_PGSTE
-static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr)
+static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp)
{
+ struct vm_area_struct *vma;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd;
+
+ /* We need a valid VMA, otherwise this is clearly a fault. */
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ return -EFAULT;
pgd = pgd_offset(mm, addr);
- p4d = p4d_alloc(mm, pgd, addr);
- if (!p4d)
- return NULL;
- pud = pud_alloc(mm, p4d, addr);
- if (!pud)
- return NULL;
- pmd = pmd_alloc(mm, pud, addr);
- return pmd;
+ if (!pgd_present(*pgd))
+ return -ENOENT;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return -ENOENT;
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return -ENOENT;
+
+ /* Large PUDs are not supported yet. */
+ if (pud_large(*pud))
+ return -EFAULT;
+
+ *pmdp = pmd_offset(pud, addr);
+ return 0;
}
#endif
@@ -437,7 +469,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -450,7 +482,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -507,7 +539,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pudp_flush_direct(mm, addr, pudp);
- *pudp = new;
+ set_pud(pudp, new);
preempt_enable();
return old;
}
@@ -547,9 +579,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
list_del(lh);
}
ptep = (pte_t *) pgtable;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
ptep++;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -614,12 +646,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
if (prot == PROT_NONE && !pte_i) {
ptep_flush_direct(mm, addr, ptep, nodat);
pgste = pgste_update_all(entry, pgste, mm);
- pte_val(entry) |= _PAGE_INVALID;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
}
if (prot == PROT_READ && !pte_p) {
ptep_flush_direct(mm, addr, ptep, nodat);
- pte_val(entry) &= ~_PAGE_INVALID;
- pte_val(entry) |= _PAGE_PROTECT;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
+ entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
pgste_val(pgste) |= bit;
pgste = pgste_set_pte(ptep, pgste, entry);
@@ -643,8 +675,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
!(pte_val(pte) & _PAGE_PROTECT))) {
pgste_val(spgste) |= PGSTE_VSIE_BIT;
tpgste = pgste_get_lock(tptep);
- pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
- (pte_val(pte) & _PAGE_PROTECT);
+ tpte = __pte((pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT));
/* don't touch the storage key - it belongs to parent pgste */
tpgste = pgste_set_pte(tptep, tpgste, tpte);
pgste_set_unlock(tptep, tpgste);
@@ -673,7 +705,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
if (!non_swap_entry(entry))
dec_mm_counter(mm, MM_SWAPENTS);
else if (is_migration_entry(entry)) {
- struct page *page = migration_entry_to_page(entry);
+ struct page *page = pfn_swap_entry_to_page(entry);
dec_mm_counter(mm, mm_counter(page));
}
@@ -716,7 +748,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
ptev = pte_val(*ptep);
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
- page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -741,10 +773,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
ptep_ipte_global(mm, addr, ptep, nodat);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) |= _PAGE_PROTECT;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
else
- pte_val(pte) |= _PAGE_INVALID;
- *ptep = pte;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
}
pgste_set_unlock(ptep, pgste);
return dirty;
@@ -760,14 +792,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * we can ignore attempts to set the key to 0, because it already is 0.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return key ? -EFAULT : 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return key ? -EFAULT : 0;
}
if (pmd_large(*pmdp)) {
@@ -783,10 +824,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
PGSTE_ACC_BITS | PGSTE_FP_BIT);
@@ -816,7 +854,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(set_guest_storage_key);
-/**
+/*
* Conditionally set a guest storage key (handling csske).
* oldkey will be updated when either mr or mc is set and a pointer is given.
*
@@ -849,7 +887,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(cond_set_guest_storage_key);
-/**
+/*
* Reset a guest reference bit (rrbe), returning the reference and changed bit.
*
* Returns < 0 in case of error, otherwise the cc to be reported to the guest.
@@ -863,14 +901,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pte_t *ptep;
int cc = 0;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0 and there is nothing for us to do.
+ */
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
spin_unlock(ptl);
- return -EFAULT;
+ return 0;
}
if (pmd_large(*pmdp)) {
@@ -882,10 +929,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
new = old = pgste_get_lock(ptep);
/* Reset guest reference bit only */
pgste_val(new) &= ~PGSTE_GR_BIT;
@@ -917,15 +961,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp;
pte_t *ptep;
- pmdp = pmd_alloc_map(mm, addr);
- if (unlikely(!pmdp))
+ /*
+ * If we don't have a PTE table and if there is no huge page mapped,
+ * the storage key is 0.
+ */
+ *key = 0;
+
+ switch (pmd_lookup(mm, addr, &pmdp)) {
+ case -ENOENT:
+ return 0;
+ case 0:
+ break;
+ default:
return -EFAULT;
+ }
ptl = pmd_lock(mm, pmdp);
if (!pmd_present(*pmdp)) {
- /* Not yet mapped memory has a zero key */
spin_unlock(ptl);
- *key = 0;
return 0;
}
@@ -938,10 +991,7 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
}
spin_unlock(ptl);
- ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl);
- if (unlikely(!ptep))
- return -EFAULT;
-
+ ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
pgste = pgste_get_lock(ptep);
*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
paddr = pte_val(*ptep) & PAGE_MASK;
@@ -970,6 +1020,7 @@ EXPORT_SYMBOL(get_guest_storage_key);
int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
unsigned long *oldpte, unsigned long *oldpgste)
{
+ struct vm_area_struct *vma;
unsigned long pgstev;
spinlock_t *ptl;
pgste_t pgste;
@@ -979,6 +1030,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
WARN_ON_ONCE(orc > ESSA_MAX);
if (unlikely(orc > ESSA_MAX))
return -EINVAL;
+
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1071,10 +1126,14 @@ EXPORT_SYMBOL(pgste_perform_essa);
int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
unsigned long bits, unsigned long value)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pgste_t new;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
@@ -1099,9 +1158,13 @@ EXPORT_SYMBOL(set_pgste_bits);
*/
int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
{
+ struct vm_area_struct *vma;
spinlock_t *ptl;
pte_t *ptep;
+ vma = vma_lookup(mm, hva);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return -EFAULT;
ptep = get_locked_pte(mm, hva, &ptl);
if (unlikely(!ptep))
return -EFAULT;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b403fa14847d..ee1a97078527 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -1,9 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
+#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
@@ -12,8 +12,8 @@
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
+#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -21,21 +21,22 @@
static DEFINE_MUTEX(vmem_mutex);
-struct memory_segment {
- struct list_head list;
- unsigned long start;
- unsigned long size;
-};
-
-static LIST_HEAD(mem_segs);
-
static void __ref *vmem_alloc_pages(unsigned int order)
{
unsigned long size = PAGE_SIZE << order;
if (slab_is_available())
return (void *)__get_free_pages(GFP_KERNEL, order);
- return (void *) memblock_phys_alloc(size, size);
+ return memblock_alloc(size, size);
+}
+
+static void vmem_free_pages(unsigned long addr, int order)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
+ return;
+ free_pages(addr, order);
}
void *vmem_crst_alloc(unsigned long val)
@@ -56,341 +57,604 @@ pte_t __ref *vmem_pte_alloc(void)
if (slab_is_available())
pte = (pte_t *) page_table_alloc(&init_mm);
else
- pte = (pte_t *) memblock_phys_alloc(size, size);
+ pte = (pte_t *) memblock_alloc(size, size);
if (!pte)
return NULL;
memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
return pte;
}
+static void vmem_pte_free(unsigned long *table)
+{
+ /* We don't expect boot memory to be removed ever. */
+ if (!slab_is_available() ||
+ WARN_ON_ONCE(PageReserved(virt_to_page(table))))
+ return;
+ page_table_free(&init_mm, table);
+}
+
+#define PAGE_UNUSED 0xFD
+
/*
- * Add a physical memory range to the 1:1 mapping.
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
+ * from unused_sub_pmd_start to next PMD_SIZE boundary.
*/
-static int vmem_add_mem(unsigned long start, unsigned long size)
-{
- unsigned long pgt_prot, sgt_prot, r3_prot;
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
- int ret = -ENOMEM;
+static unsigned long unused_sub_pmd_start;
+
+static void vmemmap_flush_unused_sub_pmd(void)
+{
+ if (!unused_sub_pmd_start)
+ return;
+ memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
+ unused_sub_pmd_start = 0;
+}
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- r3_prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- r3_prot &= ~_REGION_ENTRY_NOEXEC;
+static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed (just in case the memmap never gets initialized,
+ * e.g., because the memory block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_sub_pmd_start == start) {
+ unused_sub_pmd_start = end;
+ if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
+ unused_sub_pmd_start = 0;
+ return;
}
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
- pu_dir = pud_offset(p4_dir, address);
- if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
- !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = address | r3_prot;
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
- goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
- !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
- !debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = address | sgt_prot;
- address += PMD_SIZE;
- pages1m++;
+ vmemmap_flush_unused_sub_pmd();
+ vmemmap_mark_sub_pmd_used(start, end);
+}
+
+static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+
+ /* Could be our memmap page is filled with PAGE_UNUSED already ... */
+ vmemmap_mark_sub_pmd_used(start, end);
+
+ /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)page, PAGE_UNUSED, start - page);
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD the last
+ * unused range in the populated PMD.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_sub_pmd_start = end;
+}
+
+/* Returns true if the PMD is completely unused and can be freed. */
+static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
+{
+ unsigned long page = ALIGN_DOWN(start, PMD_SIZE);
+
+ vmemmap_flush_unused_sub_pmd();
+ memset((void *)start, PAGE_UNUSED, end - start);
+ return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
+}
+
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long prot, pages = 0;
+ int ret = -ENOMEM;
+ pte_t *pte;
+
+ prot = pgprot_val(PAGE_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_PAGE_NOEXEC;
+
+ pte = pte_offset_kernel(pmd, addr);
+ for (; addr < end; addr += PAGE_SIZE, pte++) {
+ if (!add) {
+ if (pte_none(*pte))
+ continue;
+ if (!direct)
+ vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
+ pte_clear(&init_mm, addr, pte);
+ } else if (pte_none(*pte)) {
+ if (!direct) {
+ void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+
+ if (!new_page)
+ goto out;
+ set_pte(pte, __pte(__pa(new_page) | prot));
+ } else {
+ set_pte(pte, __pte(__pa(addr) | prot));
+ }
+ } else {
continue;
}
- if (pmd_none(*pm_dir)) {
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
- goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- }
-
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = address | pgt_prot;
- address += PAGE_SIZE;
- pages4k++;
+ pages++;
}
ret = 0;
out:
- update_page_count(PG_DIRECT_MAP_4K, pages4k);
- update_page_count(PG_DIRECT_MAP_1M, pages1m);
- update_page_count(PG_DIRECT_MAP_2G, pages2g);
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
return ret;
}
-/*
- * Remove a physical memory range from the 1:1 mapping.
- * Currently only invalidates page table entries.
- */
-static void vmem_remove_range(unsigned long start, unsigned long size)
+static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
- unsigned long pages4k, pages1m, pages2g;
- unsigned long end = start + size;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
-
- pages4k = pages1m = pages2g = 0;
- while (address < end) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- address += PGDIR_SIZE;
- continue;
- }
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- address += P4D_SIZE;
- continue;
- }
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- address += PUD_SIZE;
- continue;
- }
- if (pud_large(*pu_dir)) {
- pud_clear(pu_dir);
- address += PUD_SIZE;
- pages2g++;
- continue;
- }
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- address += PMD_SIZE;
- continue;
- }
- if (pmd_large(*pm_dir)) {
- pmd_clear(pm_dir);
- address += PMD_SIZE;
- pages1m++;
- continue;
- }
- pt_dir = pte_offset_kernel(pm_dir, address);
- pte_clear(&init_mm, address, pt_dir);
- address += PAGE_SIZE;
- pages4k++;
+ pte_t *pte;
+ int i;
+
+ /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
+ pte = pte_offset_kernel(pmd, start);
+ for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+ if (!pte_none(*pte))
+ return;
}
- flush_tlb_kernel_range(start, end);
- update_page_count(PG_DIRECT_MAP_4K, -pages4k);
- update_page_count(PG_DIRECT_MAP_1M, -pages1m);
- update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+ vmem_pte_free((unsigned long *) pmd_deref(*pmd));
+ pmd_clear(pmd);
}
-/*
- * Add a backed mem_map array to the virtual mem_map array.
- */
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
- struct vmem_altmap *altmap)
-{
- unsigned long pgt_prot, sgt_prot;
- unsigned long address = start;
- pgd_t *pg_dir;
- p4d_t *p4_dir;
- pud_t *pu_dir;
- pmd_t *pm_dir;
- pte_t *pt_dir;
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
+ unsigned long end, bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
int ret = -ENOMEM;
+ pmd_t *pmd;
+ pte_t *pte;
- pgt_prot = pgprot_val(PAGE_KERNEL);
- sgt_prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX) {
- pgt_prot &= ~_PAGE_NOEXEC;
- sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
- }
- for (address = start; address < end;) {
- pg_dir = pgd_offset_k(address);
- if (pgd_none(*pg_dir)) {
- p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
- if (!p4_dir)
- goto out;
- pgd_populate(&init_mm, pg_dir, p4_dir);
- }
+ prot = pgprot_val(SEGMENT_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_SEGMENT_ENTRY_NOEXEC;
- p4_dir = p4d_offset(pg_dir, address);
- if (p4d_none(*p4_dir)) {
- pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
- if (!pu_dir)
- goto out;
- p4d_populate(&init_mm, p4_dir, pu_dir);
- }
+ pmd = pmd_offset(pud, addr);
+ for (; addr < end; addr = next, pmd++) {
+ next = pmd_addr_end(addr, end);
+ if (!add) {
+ if (pmd_none(*pmd))
+ continue;
+ if (pmd_large(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE)) {
+ if (!direct)
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ pages++;
+ } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
+ vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
+ pmd_clear(pmd);
+ }
+ continue;
+ }
+ } else if (pmd_none(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+ IS_ALIGNED(next, PMD_SIZE) &&
+ MACHINE_HAS_EDAT1 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pmd(pmd, __pmd(__pa(addr) | prot));
+ pages++;
+ continue;
+ } else if (!direct && MACHINE_HAS_EDAT1) {
+ void *new_page;
- pu_dir = pud_offset(p4_dir, address);
- if (pud_none(*pu_dir)) {
- pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
- if (!pm_dir)
+ /*
+ * Use 1MB frames for vmemmap if available. We
+ * always use large frames even if they are only
+ * partially used. Otherwise we would have also
+ * page tables since vmemmap_populate gets
+ * called for each section separately.
+ */
+ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
+ if (new_page) {
+ set_pmd(pmd, __pmd(__pa(new_page) | prot));
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE)) {
+ vmemmap_use_new_sub_pmd(addr, next);
+ }
+ continue;
+ }
+ }
+ pte = vmem_pte_alloc();
+ if (!pte)
goto out;
- pud_populate(&init_mm, pu_dir, pm_dir);
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (pmd_large(*pmd)) {
+ if (!direct)
+ vmemmap_use_sub_pmd(addr, next);
+ continue;
}
+ ret = modify_pte_table(pmd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pte_table(pmd, addr & PMD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+ return ret;
+}
- pm_dir = pmd_offset(pu_dir, address);
- if (pmd_none(*pm_dir)) {
- /* Use 1MB frames for vmemmap if available. We always
- * use large frames even if they are only partially
- * used.
- * Otherwise we would have also page tables since
- * vmemmap_populate gets called for each section
- * separately. */
- if (MACHINE_HAS_EDAT1) {
- void *new_page;
+static void try_free_pmd_table(pud_t *pud, unsigned long start)
+{
+ const unsigned long end = start + PUD_SIZE;
+ pmd_t *pmd;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+ pmd = pmd_offset(pud, start);
+ for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
+ if (!pmd_none(*pmd))
+ return;
+ vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+ pud_clear(pud);
+}
- new_page = vmemmap_alloc_block(PMD_SIZE, node);
- if (!new_page)
- goto out;
- pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
- address = (address + PMD_SIZE) & PMD_MASK;
+static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next, prot, pages = 0;
+ int ret = -ENOMEM;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ prot = pgprot_val(REGION3_KERNEL);
+ if (!MACHINE_HAS_NX)
+ prot &= ~_REGION_ENTRY_NOEXEC;
+ pud = pud_offset(p4d, addr);
+ for (; addr < end; addr = next, pud++) {
+ next = pud_addr_end(addr, end);
+ if (!add) {
+ if (pud_none(*pud))
+ continue;
+ if (pud_large(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+ pages++;
+ }
+ continue;
+ }
+ } else if (pud_none(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE) &&
+ MACHINE_HAS_EDAT2 && direct &&
+ !debug_pagealloc_enabled()) {
+ set_pud(pud, __pud(__pa(addr) | prot));
+ pages++;
continue;
}
- pt_dir = vmem_pte_alloc();
- if (!pt_dir)
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
goto out;
- pmd_populate(&init_mm, pm_dir, pt_dir);
- } else if (pmd_large(*pm_dir)) {
- address = (address + PMD_SIZE) & PMD_MASK;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (pud_large(*pud)) {
continue;
}
+ ret = modify_pmd_table(pud, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pmd_table(pud, addr & PUD_MASK);
+ }
+ ret = 0;
+out:
+ if (direct)
+ update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+ return ret;
+}
- pt_dir = pte_offset_kernel(pm_dir, address);
- if (pte_none(*pt_dir)) {
- void *new_page;
+static void try_free_pud_table(p4d_t *p4d, unsigned long start)
+{
+ const unsigned long end = start + P4D_SIZE;
+ pud_t *pud;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ pud = pud_offset(p4d, start);
+ for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+ if (!pud_none(*pud))
+ return;
+ }
+ vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+ p4d_clear(p4d);
+}
- new_page = vmemmap_alloc_block(PAGE_SIZE, node);
- if (!new_page)
+static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
+ bool add, bool direct)
+{
+ unsigned long next;
+ int ret = -ENOMEM;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ p4d = p4d_offset(pgd, addr);
+ for (; addr < end; addr = next, p4d++) {
+ next = p4d_addr_end(addr, end);
+ if (!add) {
+ if (p4d_none(*p4d))
+ continue;
+ } else if (p4d_none(*p4d)) {
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
goto out;
- pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
+ p4d_populate(&init_mm, p4d, pud);
}
- address += PAGE_SIZE;
+ ret = modify_pud_table(p4d, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_pud_table(p4d, addr & P4D_MASK);
}
ret = 0;
out:
return ret;
}
-void vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
+ const unsigned long end = start + PGDIR_SIZE;
+ p4d_t *p4d;
+ int i;
+
+ /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+ if (end > VMALLOC_START)
+ return;
+#ifdef CONFIG_KASAN
+ if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+ return;
+#endif
+
+ p4d = p4d_offset(pgd, start);
+ for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
+ if (!p4d_none(*p4d))
+ return;
+ }
+ vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+ pgd_clear(pgd);
}
-/*
- * Add memory segment to the segment list if it doesn't overlap with
- * an already present segment.
- */
-static int insert_memory_segment(struct memory_segment *seg)
+static int modify_pagetable(unsigned long start, unsigned long end, bool add,
+ bool direct)
{
- struct memory_segment *tmp;
+ unsigned long addr, next;
+ int ret = -ENOMEM;
+ pgd_t *pgd;
+ p4d_t *p4d;
- if (seg->start + seg->size > VMEM_MAX_PHYS ||
- seg->start + seg->size < seg->start)
- return -ERANGE;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
+ return -EINVAL;
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
- list_for_each_entry(tmp, &mem_segs, list) {
- if (seg->start >= tmp->start + tmp->size)
- continue;
- if (seg->start + seg->size <= tmp->start)
- continue;
- return -ENOSPC;
+ if (!add) {
+ if (pgd_none(*pgd))
+ continue;
+ } else if (pgd_none(*pgd)) {
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ ret = modify_p4d_table(pgd, addr, next, add, direct);
+ if (ret)
+ goto out;
+ if (!add)
+ try_free_p4d_table(pgd, addr & PGDIR_MASK);
}
- list_add(&seg->list, &mem_segs);
- return 0;
+ ret = 0;
+out:
+ if (!add)
+ flush_tlb_kernel_range(start, end);
+ return ret;
+}
+
+static int add_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, true, direct);
+}
+
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+ return modify_pagetable(start, end, false, direct);
}
/*
- * Remove memory segment from the segment list.
+ * Add a physical memory range to the 1:1 mapping.
*/
-static void remove_memory_segment(struct memory_segment *seg)
+static int vmem_add_range(unsigned long start, unsigned long size)
{
- list_del(&seg->list);
+ return add_pagetable(start, start + size, true);
}
-static void __remove_shared_memory(struct memory_segment *seg)
+/*
+ * Remove a physical memory range from the 1:1 mapping.
+ */
+static void vmem_remove_range(unsigned long start, unsigned long size)
{
- remove_memory_segment(seg);
- vmem_remove_range(seg->start, seg->size);
+ remove_pagetable(start, start + size, true);
}
-int vmem_remove_mapping(unsigned long start, unsigned long size)
+/*
+ * Add a backed mem_map array to the virtual mem_map array.
+ */
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
+ struct vmem_altmap *altmap)
{
- struct memory_segment *seg;
int ret;
mutex_lock(&vmem_mutex);
+ /* We don't care about the node, just use NUMA_NO_NODE on allocations */
+ ret = add_pagetable(start, end, false);
+ if (ret)
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
- ret = -ENOENT;
- list_for_each_entry(seg, &mem_segs, list) {
- if (seg->start == start && seg->size == size)
- break;
- }
-
- if (seg->start != start || seg->size != size)
- goto out;
+void vmemmap_free(unsigned long start, unsigned long end,
+ struct vmem_altmap *altmap)
+{
+ mutex_lock(&vmem_mutex);
+ remove_pagetable(start, end, false);
+ mutex_unlock(&vmem_mutex);
+}
- ret = 0;
- __remove_shared_memory(seg);
- kfree(seg);
-out:
+void vmem_remove_mapping(unsigned long start, unsigned long size)
+{
+ mutex_lock(&vmem_mutex);
+ vmem_remove_range(start, size);
mutex_unlock(&vmem_mutex);
- return ret;
+}
+
+struct range arch_get_mappable_range(void)
+{
+ struct range mhp_range;
+
+ mhp_range.start = 0;
+ mhp_range.end = VMEM_MAX_PHYS - 1;
+ return mhp_range;
}
int vmem_add_mapping(unsigned long start, unsigned long size)
{
- struct memory_segment *seg;
+ struct range range = arch_get_mappable_range();
int ret;
- mutex_lock(&vmem_mutex);
- ret = -ENOMEM;
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- goto out;
- seg->start = start;
- seg->size = size;
+ if (start < range.start ||
+ start + size > range.end + 1 ||
+ start + size < start)
+ return -ERANGE;
- ret = insert_memory_segment(seg);
+ mutex_lock(&vmem_mutex);
+ ret = vmem_add_range(start, size);
if (ret)
- goto out_free;
+ vmem_remove_range(start, size);
+ mutex_unlock(&vmem_mutex);
+ return ret;
+}
- ret = vmem_add_mem(start, size);
- if (ret)
- goto out_remove;
- goto out;
+/*
+ * Allocate new or return existing page-table entry, but do not map it
+ * to any physical address. If missing, allocate segment- and region-
+ * table entries along. Meeting a large segment- or region-table entry
+ * while traversing is an error, since the function is expected to be
+ * called against virtual regions reserverd for 4KB mappings only.
+ */
+pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
+{
+ pte_t *ptep = NULL;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
-out_remove:
- __remove_shared_memory(seg);
-out_free:
- kfree(seg);
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ if (!alloc)
+ goto out;
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ if (!alloc)
+ goto out;
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
+ goto out;
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ if (!alloc)
+ goto out;
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
+ goto out;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (WARN_ON_ONCE(pud_large(*pud))) {
+ goto out;
+ }
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ if (!alloc)
+ goto out;
+ pte = vmem_pte_alloc();
+ if (!pte)
+ goto out;
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+ goto out;
+ }
+ ptep = pte_offset_kernel(pmd, addr);
out:
+ return ptep;
+}
+
+int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
+{
+ pte_t *ptep, pte;
+
+ if (!IS_ALIGNED(addr, PAGE_SIZE))
+ return -EINVAL;
+ ptep = vmem_get_alloc_pte(addr, alloc);
+ if (!ptep)
+ return -ENOMEM;
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte = mk_pte_phys(phys, prot);
+ set_pte(ptep, pte);
+ return 0;
+}
+
+int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
+{
+ int rc;
+
+ mutex_lock(&vmem_mutex);
+ rc = __vmem_map_4k_page(addr, phys, prot, true);
+ mutex_unlock(&vmem_mutex);
+ return rc;
+}
+
+void vmem_unmap_4k_page(unsigned long addr)
+{
+ pte_t *ptep;
+
+ mutex_lock(&vmem_mutex);
+ ptep = virt_to_kpte(addr);
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte_clear(&init_mm, addr, ptep);
mutex_unlock(&vmem_mutex);
- return ret;
}
/*
@@ -400,10 +664,11 @@ out:
*/
void __init vmem_map_init(void)
{
- struct memblock_region *reg;
+ phys_addr_t base, end;
+ u64 i;
- for_each_memblock(memory, reg)
- vmem_add_mem(reg->base, reg->size);
+ for_each_mem_range(i, &base, &end)
+ vmem_add_range(base, end - base);
__set_memory((unsigned long)_stext,
(unsigned long)(_etext - _stext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
@@ -413,32 +678,16 @@ void __init vmem_map_init(void)
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT,
+ __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- pr_info("Write protected kernel read-only data: %luk\n",
- (unsigned long)(__end_rodata - _stext) >> 10);
-}
-/*
- * Convert memblock.memory to a memory segment list so there is a single
- * list that contains all memory segments.
- */
-static int __init vmem_convert_memory_chunk(void)
-{
- struct memblock_region *reg;
- struct memory_segment *seg;
+ /* lowcore requires 4k mapping for real addresses / prefixing */
+ set_memory_4k(0, LC_PAGES);
- mutex_lock(&vmem_mutex);
- for_each_memblock(memory, reg) {
- seg = kzalloc(sizeof(*seg), GFP_KERNEL);
- if (!seg)
- panic("Out of memory...\n");
- seg->start = reg->base;
- seg->size = reg->size;
- insert_memory_segment(seg);
- }
- mutex_unlock(&vmem_mutex);
- return 0;
-}
+ /* lowcore must be executable for LPSWE */
+ if (!static_key_enabled(&cpu_has_bear))
+ set_memory_x(0, 1);
-core_initcall(vmem_convert_memory_chunk);
+ pr_info("Write protected kernel read-only data: %luk\n",
+ (unsigned long)(__end_rodata - _stext) >> 10);
+}