diff options
Diffstat (limited to 'arch/s390/mm')
-rw-r--r-- | arch/s390/mm/Makefile | 4 | ||||
-rw-r--r-- | arch/s390/mm/cmm.c | 74 | ||||
-rw-r--r-- | arch/s390/mm/dump_pagetables.c | 405 | ||||
-rw-r--r-- | arch/s390/mm/extable.c | 81 | ||||
-rw-r--r-- | arch/s390/mm/extmem.c | 34 | ||||
-rw-r--r-- | arch/s390/mm/fault.c | 329 | ||||
-rw-r--r-- | arch/s390/mm/gmap.c | 424 | ||||
-rw-r--r-- | arch/s390/mm/hugetlbpage.c | 93 | ||||
-rw-r--r-- | arch/s390/mm/init.c | 53 | ||||
-rw-r--r-- | arch/s390/mm/kasan_init.c | 157 | ||||
-rw-r--r-- | arch/s390/mm/maccess.c | 234 | ||||
-rw-r--r-- | arch/s390/mm/mmap.c | 67 | ||||
-rw-r--r-- | arch/s390/mm/page-states.c | 70 | ||||
-rw-r--r-- | arch/s390/mm/pageattr.c | 85 | ||||
-rw-r--r-- | arch/s390/mm/pgalloc.c | 383 | ||||
-rw-r--r-- | arch/s390/mm/pgtable.c | 183 | ||||
-rw-r--r-- | arch/s390/mm/vmem.c | 835 |
17 files changed, 2071 insertions, 1440 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 3175413186b9..57e4f3a24829 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,11 +4,11 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_S390_PTDUMP) += dump_pagetables.o +obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o KASAN_SANITIZE_kasan_init.o := n diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index a51c892f14f3..9141ed4c52e9 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -14,15 +14,13 @@ #include <linux/moduleparam.h> #include <linux/gfp.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysctl.h> -#include <linux/ctype.h> #include <linux/swap.h> #include <linux/kthread.h> #include <linux/oom.h> -#include <linux/suspend.h> #include <linux/uaccess.h> -#include <asm/pgalloc.h> #include <asm/diag.h> #ifdef CONFIG_CMM_IUCV @@ -49,7 +47,6 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; -static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; @@ -93,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn(addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); @@ -151,9 +148,9 @@ static int cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (!cmm_suspended && (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target)) || - kthread_should_stop()); + cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -191,7 +188,7 @@ static void cmm_set_timer(void) del_timer(&cmm_timer); return; } - mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ); + mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC)); } static void cmm_timer_fn(struct timer_list *unused) @@ -247,7 +244,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -266,7 +263,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -286,7 +283,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -299,8 +296,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -313,8 +309,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } @@ -390,38 +385,6 @@ static void cmm_smsg_target(const char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; -static int cmm_suspend(void) -{ - cmm_suspended = 1; - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); - return 0; -} - -static int cmm_resume(void) -{ - cmm_suspended = 0; - cmm_kick_thread(); - return 0; -} - -static int cmm_power_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - switch (event) { - case PM_POST_HIBERNATION: - return cmm_resume(); - case PM_HIBERNATION_PREPARE: - return cmm_suspend(); - default: - return NOTIFY_DONE; - } -} - -static struct notifier_block cmm_power_notifier = { - .notifier_call = cmm_power_event, -}; - static int __init cmm_init(void) { int rc = -ENOMEM; @@ -431,13 +394,10 @@ static int __init cmm_init(void) goto out_sysctl; #ifdef CONFIG_CMM_IUCV /* convert sender to uppercase characters */ - if (sender) { - int len = strlen(sender); - while (len--) - sender[len] = toupper(sender[len]); - } else { + if (sender) + string_upper(sender, sender); + else sender = cmm_default_sender; - } rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) @@ -446,16 +406,11 @@ static int __init cmm_init(void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - rc = register_pm_notifier(&cmm_power_notifier); - if (rc) - goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (!IS_ERR(cmm_thread_ptr)) return 0; rc = PTR_ERR(cmm_thread_ptr); - unregister_pm_notifier(&cmm_power_notifier); -out_pm: unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV @@ -475,7 +430,6 @@ static void __exit cmm_exit(void) #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif - unregister_pm_notifier(&cmm_power_notifier); unregister_oom_notifier(&cmm_oom_nb); kthread_stop(cmm_thread_ptr); del_timer_sync(&cmm_timer); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 5d67b81c704a..9953819d7959 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -1,12 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/set_memory.h> +#include <linux/ptdump.h> #include <linux/seq_file.h> #include <linux/debugfs.h> -#include <linux/sched.h> #include <linux/mm.h> +#include <linux/kfence.h> #include <linux/kasan.h> +#include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> +#include <asm/nospec-branch.h> #include <asm/sections.h> -#include <asm/pgtable.h> +#include <asm/maccess.h> static unsigned long max_addr; @@ -16,266 +21,267 @@ struct addr_marker { }; enum address_markers_idx { - IDENTITY_NR = 0, + IDENTITY_BEFORE_NR = 0, + IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, +#ifdef CONFIG_KFENCE + KFENCE_START_NR, + KFENCE_END_NR, +#endif + IDENTITY_AFTER_NR, + IDENTITY_AFTER_END_NR, #ifdef CONFIG_KASAN KASAN_SHADOW_START_NR, KASAN_SHADOW_END_NR, #endif VMEMMAP_NR, + VMEMMAP_END_NR, VMALLOC_NR, + VMALLOC_END_NR, MODULES_NR, + MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, }; static struct addr_marker address_markers[] = { - [IDENTITY_NR] = {0, "Identity Mapping"}, + [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, + [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, +#ifdef CONFIG_KFENCE + [KFENCE_START_NR] = {0, "KFence Pool Start"}, + [KFENCE_END_NR] = {0, "KFence Pool End"}, +#endif + [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, + [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, #ifdef CONFIG_KASAN [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, #endif - [VMEMMAP_NR] = {0, "vmemmap Area"}, - [VMALLOC_NR] = {0, "vmalloc Area"}, - [MODULES_NR] = {0, "Modules Area"}, + [VMEMMAP_NR] = {0, "vmemmap Area Start"}, + [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, + [VMALLOC_NR] = {0, "vmalloc Area Start"}, + [VMALLOC_END_NR] = {0, "vmalloc Area End"}, + [MODULES_NR] = {0, "Modules Area Start"}, + [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, { -1, NULL } }; struct pg_state { + struct ptdump_state ptdump; + struct seq_file *seq; int level; unsigned int current_prot; + bool check_wx; + unsigned long wx_pages; unsigned long start_address; - unsigned long current_address; const struct addr_marker *marker; }; +#define pt_dump_seq_printf(m, fmt, args...) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + struct seq_file *__m = (m); \ + \ + if (__m) \ + seq_printf(__m, fmt); \ +}) + static void print_prot(struct seq_file *m, unsigned int pr, int level) { static const char * const level_name[] = { "ASCE", "PGD", "PUD", "PMD", "PTE" }; - seq_printf(m, "%s ", level_name[level]); + pt_dump_seq_printf(m, "%s ", level_name[level]); if (pr & _PAGE_INVALID) { - seq_printf(m, "I\n"); + pt_dump_seq_printf(m, "I\n"); return; } - seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); - seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); + pt_dump_seq_puts(m, (pr & _PAGE_PROTECT) ? "RO " : "RW "); + pt_dump_seq_puts(m, (pr & _PAGE_NOEXEC) ? "NX\n" : "X\n"); } -static void note_page(struct seq_file *m, struct pg_state *st, - unsigned int new_prot, int level) +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ +#ifdef CONFIG_DEBUG_WX + if (!st->check_wx) + return; + if (st->current_prot & _PAGE_INVALID) + return; + if (st->current_prot & _PAGE_PROTECT) + return; + if (st->current_prot & _PAGE_NOEXEC) + return; + /* + * The first lowcore page is W+X if spectre mitigations are using + * trampolines or the BEAR enhancements facility is not installed, + * in which case we have two lpswe instructions in lowcore that need + * to be executable. + */ + if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) + return; + WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + (void *)st->start_address); + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +#endif /* CONFIG_DEBUG_WX */ +} + +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { - static const char units[] = "KMGTPE"; int width = sizeof(unsigned long) * 2; + static const char units[] = "KMGTPE"; const char *unit = units; - unsigned int prot, cur; unsigned long delta; + struct pg_state *st; + struct seq_file *m; + unsigned int prot; - /* - * If we have a "break" in the series, we need to flush the state - * that we have now. "break" is either changing perms, levels or - * address space marker. - */ - prot = new_prot; - cur = st->current_prot; - - if (!st->level) { - /* First entry */ - st->current_prot = new_prot; + st = container_of(pt_st, struct pg_state, ptdump); + m = st->seq; + prot = val & (_PAGE_PROTECT | _PAGE_NOEXEC); + if (level == 4 && (val & _PAGE_INVALID)) + prot = _PAGE_INVALID; + /* For pmd_none() & friends val gets passed as zero. */ + if (level != 4 && !val) + prot = _PAGE_INVALID; + /* Final flush from generic code. */ + if (level == -1) + addr = max_addr; + if (st->level == -1) { + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); + st->start_address = addr; + st->current_prot = prot; st->level = level; - st->marker = address_markers; - seq_printf(m, "---[ %s ]---\n", st->marker->name); - } else if (prot != cur || level != st->level || - st->current_address >= st->marker[1].start_address) { - /* Print the actual finished series */ - seq_printf(m, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - delta = (st->current_address - st->start_address) >> 10; + } else if (prot != st->current_prot || level != st->level || + addr >= st->marker[1].start_address) { + note_prot_wx(st, addr); + pt_dump_seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, addr); + delta = (addr - st->start_address) >> 10; while (!(delta & 0x3ff) && unit[1]) { delta >>= 10; unit++; } - seq_printf(m, "%9lu%c ", delta, *unit); + pt_dump_seq_printf(m, "%9lu%c ", delta, *unit); print_prot(m, st->current_prot, st->level); - while (st->current_address >= st->marker[1].start_address) { + while (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(m, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(m, "---[ %s ]---\n", st->marker->name); } - st->start_address = st->current_address; - st->current_prot = new_prot; + st->start_address = addr; + st->current_prot = prot; st->level = level; } } -#ifdef CONFIG_KASAN -static void note_kasan_early_shadow_page(struct seq_file *m, - struct pg_state *st) -{ - unsigned int prot; - - prot = pte_val(*kasan_early_shadow_pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); -} -#endif - -/* - * The actual page table walker functions. In order to keep the - * implementation of print_prot() short, we only check and pass - * _PAGE_INVALID and _PAGE_PROTECT flags to note_page() if a region, - * segment or page table entry is invalid or read-only. - * After all it's just a hint that the current level being walked - * contains an invalid or read-only entry. - */ -static void walk_pte_level(struct seq_file *m, struct pg_state *st, - pmd_t *pmd, unsigned long addr) -{ - unsigned int prot; - pte_t *pte; - int i; - - for (i = 0; i < PTRS_PER_PTE && addr < max_addr; i++) { - st->current_address = addr; - pte = pte_offset_kernel(pmd, addr); - prot = pte_val(*pte) & - (_PAGE_PROTECT | _PAGE_INVALID | _PAGE_NOEXEC); - note_page(m, st, prot, 4); - addr += PAGE_SIZE; - } -} - -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, - pud_t *pud, unsigned long addr) -{ - unsigned int prot; - pmd_t *pmd; - int i; - -#ifdef CONFIG_KASAN - if ((pud_val(*pud) & PAGE_MASK) == __pa(kasan_early_shadow_pmd)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif - - pmd = pmd_offset(pud, addr); - for (i = 0; i < PTRS_PER_PMD && addr < max_addr; i++, pmd++) { - st->current_address = addr; - if (!pmd_none(*pmd)) { - if (pmd_large(*pmd)) { - prot = pmd_val(*pmd) & - (_SEGMENT_ENTRY_PROTECT | - _SEGMENT_ENTRY_NOEXEC); - note_page(m, st, prot, 3); - } else - walk_pte_level(m, st, pmd, addr); - } else - note_page(m, st, _PAGE_INVALID, 3); - addr += PMD_SIZE; - } -} - -static void walk_pud_level(struct seq_file *m, struct pg_state *st, - p4d_t *p4d, unsigned long addr) +#ifdef CONFIG_DEBUG_WX +void ptdump_check_wx(void) { - unsigned int prot; - pud_t *pud; - int i; + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = NULL, + .level = -1, + .current_prot = 0, + .check_wx = true, + .wx_pages = 0, + .start_address = 0, + .marker = (struct addr_marker[]) { + { .start_address = 0, .name = NULL}, + { .start_address = -1, .name = NULL}, + }, + }; -#ifdef CONFIG_KASAN - if ((p4d_val(*p4d) & PAGE_MASK) == __pa(kasan_early_shadow_pud)) { - note_kasan_early_shadow_page(m, st); + if (!MACHINE_HAS_NX) return; - } -#endif - - pud = pud_offset(p4d, addr); - for (i = 0; i < PTRS_PER_PUD && addr < max_addr; i++, pud++) { - st->current_address = addr; - if (!pud_none(*pud)) - if (pud_large(*pud)) { - prot = pud_val(*pud) & - (_REGION_ENTRY_PROTECT | - _REGION_ENTRY_NOEXEC); - note_page(m, st, prot, 2); - } else - walk_pmd_level(m, st, pud, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += PUD_SIZE; - } + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + if (st.wx_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); + else + pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", + (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? + "unexpected " : ""); } +#endif /* CONFIG_DEBUG_WX */ -static void walk_p4d_level(struct seq_file *m, struct pg_state *st, - pgd_t *pgd, unsigned long addr) +#ifdef CONFIG_PTDUMP_DEBUGFS +static int ptdump_show(struct seq_file *m, void *v) { - p4d_t *p4d; - int i; + struct pg_state st = { + .ptdump = { + .note_page = note_page, + .range = (struct ptdump_range[]) { + {.start = 0, .end = max_addr}, + {.start = 0, .end = 0}, + } + }, + .seq = m, + .level = -1, + .current_prot = 0, + .check_wx = false, + .wx_pages = 0, + .start_address = 0, + .marker = address_markers, + }; -#ifdef CONFIG_KASAN - if ((pgd_val(*pgd) & PAGE_MASK) == __pa(kasan_early_shadow_p4d)) { - note_kasan_early_shadow_page(m, st); - return; - } -#endif - - p4d = p4d_offset(pgd, addr); - for (i = 0; i < PTRS_PER_P4D && addr < max_addr; i++, p4d++) { - st->current_address = addr; - if (!p4d_none(*p4d)) - walk_pud_level(m, st, p4d, addr); - else - note_page(m, st, _PAGE_INVALID, 2); - addr += P4D_SIZE; - } + get_online_mems(); + mutex_lock(&cpa_mutex); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + mutex_unlock(&cpa_mutex); + put_online_mems(); + return 0; } +DEFINE_SHOW_ATTRIBUTE(ptdump); +#endif /* CONFIG_PTDUMP_DEBUGFS */ -static void walk_pgd_level(struct seq_file *m) +/* + * Heapsort from lib/sort.c is not a stable sorting algorithm, do a simple + * insertion sort to preserve the original order of markers with the same + * start address. + */ +static void sort_address_markers(void) { - unsigned long addr = 0; - struct pg_state st; - pgd_t *pgd; - int i; + struct addr_marker tmp; + int i, j; - memset(&st, 0, sizeof(st)); - for (i = 0; i < PTRS_PER_PGD && addr < max_addr; i++) { - st.current_address = addr; - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) - walk_p4d_level(m, &st, pgd, addr); - else - note_page(m, &st, _PAGE_INVALID, 1); - addr += PGDIR_SIZE; - cond_resched(); + for (i = 1; i < ARRAY_SIZE(address_markers) - 1; i++) { + tmp = address_markers[i]; + for (j = i - 1; j >= 0 && address_markers[j].start_address > tmp.start_address; j--) + address_markers[j + 1] = address_markers[j]; + address_markers[j + 1] = tmp; } - /* Flush out the last page */ - st.current_address = max_addr; - note_page(m, &st, 0, 0); -} - -static int ptdump_show(struct seq_file *m, void *v) -{ - walk_pgd_level(m); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); } -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int pt_dump_init(void) { +#ifdef CONFIG_KFENCE + unsigned long kfence_start = (unsigned long)__kfence_pool; +#endif /* * Figure out the maximum virtual address being accessible with the * kernel ASCE. We need this to keep the page table walker functions @@ -283,10 +289,27 @@ static int pt_dump_init(void) */ max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); + address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = __samode31; + address_markers[AMODE31_END_NR].start_address = __eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; + address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; + address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; + address_markers[VMALLOC_END_NR].start_address = VMALLOC_END; +#ifdef CONFIG_KFENCE + address_markers[KFENCE_START_NR].start_address = kfence_start; + address_markers[KFENCE_END_NR].start_address = kfence_start + KFENCE_POOL_SIZE; +#endif + sort_address_markers(); +#ifdef CONFIG_PTDUMP_DEBUGFS debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); +#endif /* CONFIG_PTDUMP_DEBUGFS */ return 0; } device_initcall(pt_dump_init); diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c new file mode 100644 index 000000000000..1e4d2187541a --- /dev/null +++ b/arch/s390/mm/extable.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + size_t len = FIELD_GET(EX_DATA_LEN, ex->data); + + regs->gprs[reg_err] = -EFAULT; + memset((void *)regs->gprs[reg_addr], 0, len); + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_STORE: + return ex_handler_ua_store(ex, regs); + case EX_TYPE_UA_LOAD_MEM: + return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, regs); + } + panic("invalid exception table entry"); +} diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index fd0dae9d10f4..5060956b8e7d 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -20,9 +20,9 @@ #include <linux/ctype.h> #include <linux/ioport.h> #include <linux/refcount.h> +#include <linux/pgtable.h> #include <asm/diag.h> #include <asm/page.h> -#include <asm/pgtable.h> #include <asm/ebcdic.h> #include <asm/errno.h> #include <asm/extmem.h> @@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long goto out_free; } - rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); - - if (rc) - goto out_free; - seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL); if (seg->res == NULL) { rc = -ENOMEM; - goto out_shared; + goto out_free; } seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; seg->res->start = seg->start_addr; @@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long if (rc == SEG_TYPE_SC || ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; + + /* Check for overlapping resources before adding the mapping. */ if (request_resource(&iomem_resource, seg->res)) { rc = -EBUSY; - kfree(seg->res); - goto out_shared; + goto out_free_resource; } + rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1); + if (rc) + goto out_resource; + if (do_nonshared) diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name, &start_addr, &end_addr); @@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); rc = diag_cc; - goto out_resource; + goto out_mapping; } if (diag_cc > 1) { pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr); rc = dcss_diag_translate_rc(end_addr); dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy); - goto out_resource; + goto out_mapping; } seg->start_addr = start_addr; seg->end = end_addr; @@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long (void*) seg->end, segtype_string[seg->vm_segtype]); } goto out; + out_mapping: + vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_resource: release_resource(seg->res); + out_free_resource: kfree(seg->res); - out_shared: - vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1); out_free: kfree(seg); out: @@ -400,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long * -EIO : could not perform query or load diagnose * -ENOENT : no such segment * -EOPNOTSUPP: multi-part segment cannot be used with linux - * -ENOSPC : segment cannot be used (overlaps with storage) - * -EBUSY : segment can temporarily not be used (overlaps with dcss) + * -EBUSY : segment cannot be used (overlaps with dcss or storage) * -ERANGE : segment cannot be used (exceeds kernel mapping range) * -EPERM : segment is currently loaded with incompatible permissions * -ENOMEM : out of memory @@ -626,10 +626,6 @@ void segment_warning(int rc, char *seg_name) pr_err("DCSS %s has multiple page ranges and cannot be " "loaded or queried\n", seg_name); break; - case -ENOSPC: - pr_err("DCSS %s overlaps with used storage and cannot " - "be loaded\n", seg_name); - break; case -EBUSY: pr_err("%s needs used memory resources and cannot be " "loaded or queried\n", seg_name); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 7b0bb475c166..9649d9382e0a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -31,29 +31,30 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/hugetlb.h> +#include <linux/kfence.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> #include <asm/diag.h> -#include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/irq.h> #include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" #define __FAIL_ADDR_MASK -4096L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL -#define VM_FAULT_BADCONTEXT 0x010000 -#define VM_FAULT_BADMAP 0x020000 -#define VM_FAULT_BADACCESS 0x040000 -#define VM_FAULT_SIGNAL 0x080000 -#define VM_FAULT_PFAULT 0x100000 +#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) +#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) +#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) +#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) +#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) enum fault_type { KERNEL_FAULT, USER_FAULT, - VDSO_FAULT, GMAP_FAULT, }; @@ -77,22 +78,16 @@ static enum fault_type get_fault_type(struct pt_regs *regs) trans_exc_code = regs->int_parm_long & 3; if (likely(trans_exc_code == 0)) { /* primary space exception */ - if (IS_ENABLED(CONFIG_PGSTE) && - test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return GMAP_FAULT; - if (current->thread.mm_segment == USER_DS) + if (user_mode(regs)) return USER_FAULT; - return KERNEL_FAULT; - } - if (trans_exc_code == 2) { - /* secondary space exception */ - if (current->thread.mm_segment & 1) { - if (current->thread.mm_segment == USER_DS_SACF) - return USER_FAULT; + if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; - } - return VDSO_FAULT; + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + return KERNEL_FAULT; } + if (trans_exc_code == 2) + return USER_FAULT; if (trans_exc_code == 1) { /* access register mode, not used in the kernel */ return USER_FAULT; @@ -105,7 +100,7 @@ static int bad_address(void *p) { unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); + return get_kernel_nofault(dummy, (unsigned long *)p); } static void dump_pagetable(unsigned long asce, unsigned long address) @@ -121,8 +116,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R1:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; if (bad_address(table)) @@ -130,8 +125,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R2:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; if (bad_address(table)) @@ -139,8 +134,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R3:%016lx ", *table); if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* fallthrough */ + table = __va(*table & _REGION_ENTRY_ORIGIN); + fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (bad_address(table)) @@ -148,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("S:%016lx ", *table); if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) @@ -188,10 +183,6 @@ static void dump_fault_info(struct pt_regs *regs) asce = S390_lowcore.user_asce; pr_cont("user "); break; - case VDSO_FAULT: - asce = S390_lowcore.vdso_asce; - pr_cont("vdso "); - break; case GMAP_FAULT: asce = ((struct gmap *) S390_lowcore.gmap)->asce; pr_cont("gmap "); @@ -237,29 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -const struct exception_table_entry *s390_search_extables(unsigned long addr) -{ - const struct exception_table_entry *fixup; - - fixup = search_extable(__start_dma_ex_table, - __stop_dma_ex_table - __start_dma_ex_table, - addr); - if (!fixup) - fixup = search_exception_tables(addr); - return fixup; -} - static noinline void do_no_context(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - - /* Are we prepared to handle this kernel fault? */ - fixup = s390_search_extables(regs->psw.addr); - if (fixup) { - regs->psw.addr = extable_fixup(fixup); + if (fixup_exception(regs)) return; - } - /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. @@ -272,7 +244,6 @@ static noinline void do_no_context(struct pt_regs *regs) " in virtual user address space\n"); dump_fault_info(regs); die(regs, "Oops"); - do_exit(SIGKILL); } static noinline void do_low_address(struct pt_regs *regs) @@ -282,7 +253,6 @@ static noinline void do_low_address(struct pt_regs *regs) if (regs->psw.mask & PSW_MASK_PSTATE) { /* Low-address protection hit in user mode 'cannot happen'. */ die (regs, "Low-address protection"); - do_exit(SIGKILL); } do_no_context(regs); @@ -298,36 +268,12 @@ static noinline void do_sigbus(struct pt_regs *regs) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -static noinline int signal_return(struct pt_regs *regs) -{ - u16 instruction; - int rc; - - rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - if (rc) - return rc; - if (instruction == 0x0a77) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x00040077; - return 0; - } else if (instruction == 0x0aad) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x000400ad; - return 0; - } - return -EACCES; -} - -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) +static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) { int si_code; switch (fault) { case VM_FAULT_BADACCESS: - if (access == VM_EXEC && signal_return(regs) == 0) - break; - /* fallthrough */ case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. */ if (user_mode(regs)) { @@ -337,9 +283,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, do_sigsegv(regs, si_code); break; } - /* fallthrough */ + fallthrough; case VM_FAULT_BADCONTEXT: - /* fallthrough */ case VM_FAULT_PFAULT: do_no_context(regs); break; @@ -377,7 +322,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suprression) + * 04 Protection -> Write-Protection (suppression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) @@ -393,19 +338,22 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) unsigned long address; unsigned int flags; vm_fault_t fault; + bool is_write; tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); if (kprobe_page_fault(regs, 14)) return 0; mm = tsk->mm; trans_exc_code = regs->int_parm_long; + address = trans_exc_code & __FAIL_ADDR_MASK; + is_write = (trans_exc_code & store_indication) == 0x400; /* * Verify that the fault happened in user space, that @@ -416,9 +364,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: - goto out; - case VDSO_FAULT: - fault = VM_FAULT_BADMAP; + if (kfence_handle_page_fault(address, is_write, regs)) + return 0; goto out; case USER_FAULT: case GMAP_FAULT: @@ -427,14 +374,15 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) break; } - address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); - flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - down_read(&mm->mmap_sem); + mmap_read_lock(mm); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { @@ -472,57 +420,49 @@ retry: if (unlikely(!(vma->vm_flags & access))) goto out_up; - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(vma, address, flags); - /* No reason to continue if interrupted by SIGKILL. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + fault = handle_mm_fault(vma, address, flags, regs); + if (fault_signal_pending(fault, regs)) { fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) goto out_up; goto out; } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto out_gmap; + } + fault = 0; + goto out; + } + if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; - /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. - */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_sem has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~(FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT); - flags |= FAULT_FLAG_TRIED; - down_read(&mm->mmap_sem); - goto retry; + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + mmap_read_lock(mm); + goto retry; } +out_gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); @@ -537,7 +477,7 @@ retry: } fault = 0; out_up: - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); out: return fault; } @@ -575,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs) fault = do_exception(regs, access); } if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_protection_exception); @@ -584,10 +524,10 @@ void do_dat_exception(struct pt_regs *regs) int access; vm_fault_t fault; - access = VM_READ | VM_EXEC | VM_WRITE; + access = VM_ACCESS_FLAGS; fault = do_exception(regs, access); if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_dat_exception); @@ -737,7 +677,7 @@ static void pfault_interrupt(struct ext_code ext_code, * interrupt since it must be a leftover of a PFAULT * CANCEL operation which didn't remove all pending * completion interrupts. */ - if (tsk->state == TASK_RUNNING) + if (task_is_running(tsk)) tsk->thread.pfault_wait = -1; } } else { @@ -816,3 +756,130 @@ out_extint: early_initcall(pfault_irq_init); #endif /* CONFIG_PFAULT */ + +#if IS_ENABLED(CONFIG_PGSTE) + +void do_secure_storage_access(struct pt_regs *regs) +{ + unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + struct gmap *gmap; + int rc; + + /* + * bit 61 tells us if the address is valid, if it's not we + * have a major problem and should stop the kernel or send a + * SIGSEGV to the process. Unfortunately bit 61 is not + * reliable without the misc UV feature so we need to check + * for that as well. + */ + if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && + !test_bit_inv(61, ®s->int_parm_long)) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; + } + + /* + * The kernel should never run into this case and we + * have no way out of this situation. + */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); + } + + switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) { + do_fault_error(regs, VM_FAULT_BADMAP); + break; + } + fallthrough; + case USER_FAULT: + mm = current->mm; + mmap_read_lock(mm); + vma = find_vma(mm, addr); + if (!vma) { + mmap_read_unlock(mm); + do_fault_error(regs, VM_FAULT_BADMAP); + break; + } + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + mmap_read_unlock(mm); + break; + } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + mmap_read_unlock(mm); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + default: + do_fault_error(regs, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + } +} +NOKPROBE_SYMBOL(do_secure_storage_access); + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + if (get_fault_type(regs) != GMAP_FAULT) { + do_fault_error(regs, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + return; + } + + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); +} +NOKPROBE_SYMBOL(do_non_secure_storage_access); + +void do_secure_storage_violation(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; + /* + * Either KVM messed up the secure guest mapping or the same + * page is mapped into multiple secure guests. + * + * This exception is only triggered when a guest 2 is running + * and can therefore never occur in kernel context. + */ + printk_ratelimited(KERN_WARNING + "Secure storage violation in task: %s, pid %d\n", + current->comm, current->pid); + send_sig(SIGSEGV, current, 0); +} + +#endif /* CONFIG_PGSTE */ diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index edcdca97e85e..02d15c8dc92e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2,7 +2,7 @@ /* * KVM guest address space mapping code * - * Copyright IBM Corp. 2007, 2016, 2018 + * Copyright IBM Corp. 2007, 2020 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> * David Hildenbrand <david@redhat.com> * Janosch Frank <frankja@linux.vnet.ibm.com> @@ -17,8 +17,8 @@ #include <linux/swapops.h> #include <linux/ksm.h> #include <linux/mman.h> +#include <linux/pgtable.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/tlb.h> @@ -27,7 +27,6 @@ /** * gmap_alloc - allocate and initialize a guest address space - * @mm: pointer to the parent mm_struct * @limit: maximum address of the gmap address space * * Returns a guest address space structure. @@ -56,19 +55,19 @@ static struct gmap *gmap_alloc(unsigned long limit) atype = _ASCE_TYPE_REGION1; etype = _REGION1_ENTRY_EMPTY; } - gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL); + gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT); if (!gmap) goto out; INIT_LIST_HEAD(&gmap->crst_list); INIT_LIST_HEAD(&gmap->children); INIT_LIST_HEAD(&gmap->pt_list); - INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL); - INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); - INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC); + INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT); + INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) goto out_free; page->index = 0; @@ -300,7 +299,7 @@ struct gmap *gmap_get_enabled(void) EXPORT_SYMBOL_GPL(gmap_get_enabled); /* - * gmap_alloc_table is assumed to be called with mmap_sem held + * gmap_alloc_table is assumed to be called with mmap_lock held */ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long init, unsigned long gaddr) @@ -309,7 +308,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -405,10 +404,10 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len) return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) flush |= __gmap_unmap_by_gaddr(gmap, to + off); - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); return 0; @@ -438,7 +437,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, return -EINVAL; flush = 0; - down_write(&gmap->mm->mmap_sem); + mmap_write_lock(gmap->mm); for (off = 0; off < len; off += PMD_SIZE) { /* Remove old translation */ flush |= __gmap_unmap_by_gaddr(gmap, to + off); @@ -448,7 +447,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from, (void *) from + off)) break; } - up_write(&gmap->mm->mmap_sem); + mmap_write_unlock(gmap->mm); if (flush) gmap_flush_tlb(gmap); if (off >= len) @@ -466,7 +465,7 @@ EXPORT_SYMBOL_GPL(gmap_map_segment); * Returns user space address which corresponds to the guest address or * -EFAULT if no such mapping exists. * This function does not establish potentially missing page table entries. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. * * Note: Can also be called for shadow gmaps. @@ -495,16 +494,16 @@ unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr) { unsigned long rc; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = __gmap_translate(gmap, gaddr); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_translate); /** * gmap_unlink - disconnect a page table from the gmap shadow tables - * @gmap: pointer to guest mapping meta data structure + * @mm: pointer to the parent mm_struct * @table: pointer to the host page table * @vmaddr: vm address associated with the host page table */ @@ -527,14 +526,14 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new, unsigned long gaddr); /** - * gmap_link - set up shadow page tables to connect a host to a guest address + * __gmap_link - set up shadow page tables to connect a host to a guest address * @gmap: pointer to guest mapping meta data structure * @gaddr: guest address * @vmaddr: vm address * * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT * if the vm address is already mapped to a different guest segment. - * The mmap_sem of the mm that belongs to the address space must be held + * The mmap_lock of the mm that belongs to the address space must be held * when this function gets called. */ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) @@ -594,7 +593,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) return rc; ptl = pmd_lock(mm, pmd); @@ -640,7 +639,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, int rc; bool unlocked; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); retry: unlocked = false; @@ -649,13 +648,13 @@ retry: rc = vmaddr; goto out_up; } - if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, + if (fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked)) { rc = -EFAULT; goto out_up; } /* - * In the case that fixup_user_fault unlocked the mmap_sem during + * In the case that fixup_user_fault unlocked the mmap_lock during * faultin redo __gmap_translate to not race with a map/unmap_segment. */ if (unlocked) @@ -663,16 +662,17 @@ retry: rc = __gmap_link(gmap, gaddr, vmaddr); out_up: - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_fault); /* - * this function is assumed to be called with mmap_sem held + * this function is assumed to be called with mmap_lock held */ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) { + struct vm_area_struct *vma; unsigned long vmaddr; spinlock_t *ptl; pte_t *ptep; @@ -682,11 +682,17 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) gaddr >> PMD_SHIFT); if (vmaddr) { vmaddr |= gaddr & ~PMD_MASK; + + vma = vma_lookup(gmap->mm, vmaddr); + if (!vma || is_vm_hugetlb_page(vma)) + return; + /* Get pointer to the page table entry */ ptep = get_locked_pte(gmap->mm, vmaddr, &ptl); - if (likely(ptep)) + if (likely(ptep)) { ptep_zap_unused(gmap->mm, vmaddr, ptep, 0); - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(ptep, ptl); + } } } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -696,7 +702,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) unsigned long gaddr, vmaddr, size; struct vm_area_struct *vma; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); for (gaddr = from; gaddr < to; gaddr = (gaddr + PMD_SIZE) & PMD_MASK) { /* Find the vm address for the guest address */ @@ -719,7 +725,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); } - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); } EXPORT_SYMBOL_GPL(gmap_discard); @@ -787,16 +793,20 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start, static inline unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level) { - unsigned long *table; + const int asce_type = gmap->asce & _ASCE_TYPE_MASK; + unsigned long *table = gmap->table; - if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4)) - return NULL; if (gmap_is_shadow(gmap) && gmap->removed) return NULL; - if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11))) + + if (WARN_ON_ONCE(level > (asce_type >> 2) + 1)) return NULL; - table = gmap->table; - switch (gmap->asce & _ASCE_TYPE_MASK) { + + if (asce_type != _ASCE_TYPE_REGION1 && + gaddr & (-1UL << (31 + (asce_type >> 2) * 11))) + return NULL; + + switch (asce_type) { case _ASCE_TYPE_REGION1: table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT; if (level == 4) @@ -804,7 +814,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) @@ -812,7 +822,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) @@ -820,7 +830,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) @@ -875,10 +885,10 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr, BUG_ON(gmap_is_shadow(gmap)); fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0; - if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) + if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked)) return -EFAULT; if (unlocked) - /* lost mmap_sem, caller has to retry __gmap_translate */ + /* lost mmap_lock, caller has to retry __gmap_translate */ return 0; /* Connect the page tables */ return __gmap_link(gmap, gaddr, vmaddr); @@ -949,7 +959,7 @@ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) * -EAGAIN if a fixup is needed * -EINVAL if unsupported notifier bits have been specified * - * Expected to be called with sg->mm->mmap_sem in read and + * Expected to be called with sg->mm->mmap_lock in read and * guest_table_lock held. */ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, @@ -964,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, return -EAGAIN; if (prot == PROT_NONE && !pmd_i) { - pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (prot == PROT_READ && !pmd_p) { - pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; - pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (bits & GMAP_NOTIFY_MPROT) - pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); /* Shadow GMAP protection needs split PMDs */ if (bits & GMAP_NOTIFY_SHADOW) @@ -995,7 +1005,7 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EAGAIN if a fixup is needed. * - * Expected to be called with sg->mm->mmap_sem in read + * Expected to be called with sg->mm->mmap_lock in read */ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pmd_t *pmdp, int prot, unsigned long bits) @@ -1031,7 +1041,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, * Returns 0 if successfully protected, -ENOMEM if out of memory and * -EFAULT if gaddr is invalid (or mapping for shadows is missing). * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, unsigned long len, int prot, unsigned long bits) @@ -1102,9 +1112,9 @@ int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr, return -EINVAL; if (!MACHINE_HAS_ESOP && prot == PROT_READ) return -EINVAL; - down_read(&gmap->mm->mmap_sem); + mmap_read_lock(gmap->mm); rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT); - up_read(&gmap->mm->mmap_sem); + mmap_read_unlock(gmap->mm); return rc; } EXPORT_SYMBOL_GPL(gmap_mprotect_notify); @@ -1120,7 +1130,7 @@ EXPORT_SYMBOL_GPL(gmap_mprotect_notify); * if reading using the virtual address failed. -EINVAL if called on a gmap * shadow. * - * Called with gmap->mm->mmap_sem in read. + * Called with gmap->mm->mmap_lock in read. */ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) { @@ -1141,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } @@ -1173,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1180,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1214,11 +1231,11 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, vmaddr = __gmap_translate(parent, paddr); if (IS_ERR_VALUE(vmaddr)) return vmaddr; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = raddr; - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) { kfree(rmap); return rc; @@ -1268,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -1692,11 +1709,11 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, } spin_unlock(&parent->shadow_lock); /* protect after insertion, so it will get properly invalidated */ - down_read(&parent->mm->mmap_sem); + mmap_read_lock(parent->mm); rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN, ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE, PROT_READ, GMAP_NOTIFY_SHADOW); - up_read(&parent->mm->mmap_sem); + mmap_read_unlock(parent->mm); spin_lock(&parent->shadow_lock); new->initialized = true; if (rc) { @@ -1725,7 +1742,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) @@ -1737,7 +1754,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; @@ -1809,7 +1826,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r2t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) @@ -1821,7 +1838,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; @@ -1840,6 +1857,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, goto out_free; } else if (*table & _REGION_ENTRY_ORIGIN) { rc = -EAGAIN; /* Race with shadow */ + goto out_free; } crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ @@ -1892,7 +1910,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_r3t); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) @@ -1904,7 +1922,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; @@ -1966,7 +1984,7 @@ out_free: EXPORT_SYMBOL_GPL(gmap_shadow_sgt); /** - * gmap_shadow_lookup_pgtable - find a shadow page table + * gmap_shadow_pgt_lookup - find a shadow page table * @sg: pointer to the shadow guest address space structure * @saddr: the address in the shadow aguest address space * @pgt: parent gmap address of the page table to get shadowed @@ -1976,7 +1994,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_sgt); * Returns 0 if the shadow page table was found and -EAGAIN if the page * table was not found. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, int *dat_protection, @@ -2016,7 +2034,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup); * shadow table structure is incomplete, -ENOMEM if out of memory, * -EFAULT if an address in the parent gmap could not be resolved and * - * Called with gmap->mm->mmap_sem in read + * Called with gmap->mm->mmap_lock in read */ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) @@ -2095,7 +2113,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow_pgt); * shadow table structure is incomplete, -ENOMEM if out of memory and * -EFAULT if an address in the parent gmap could not be resolved. * - * Called with sg->mm->mmap_sem in read. + * Called with sg->mm->mmap_lock in read. */ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) { @@ -2111,7 +2129,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) parent = sg->parent; prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE; - rmap = kzalloc(sizeof(*rmap), GFP_KERNEL); + rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT); if (!rmap) return -ENOMEM; rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE; @@ -2123,7 +2141,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rc = vmaddr; break; } - rc = radix_tree_preload(GFP_KERNEL); + rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); if (rc) break; rc = -EAGAIN; @@ -2160,7 +2178,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) } EXPORT_SYMBOL_GPL(gmap_shadow_page); -/** +/* * gmap_shadow_notify - handle notifications for shadow gmap * * Called with sg->parent->shadow_lock. @@ -2220,7 +2238,7 @@ static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr, /** * ptep_notify - call all invalidation callbacks for a specific pte. * @mm: pointer to the process mm_struct - * @addr: virtual address in the process address space + * @vmaddr: virtual address in the process address space * @pte: pointer to the page table entry * @bits: bits from the pgste that caused the notify call * @@ -2264,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify); static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, unsigned long gaddr) { - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } @@ -2283,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, { gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); - pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2291,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *pmdp = new; + set_pmd(pmdp, new); } static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, @@ -2313,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); } @@ -2436,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, return false; /* Clear UC indication and reset protection */ - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); return true; } @@ -2480,23 +2498,37 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], } EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + split_huge_pmd(vma, pmd, addr); + return 0; +} + +static const struct mm_walk_ops thp_split_walk_ops = { + .pmd_entry = thp_split_walk_pmd_entry, +}; + static inline void thp_split_mm(struct mm_struct *mm) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct vm_area_struct *vma; - unsigned long addr; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - for (addr = vma->vm_start; - addr < vma->vm_end; - addr += PAGE_SIZE) - follow_page(vma, addr, FOLL_SPLIT); + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; + walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; -#endif } +#else +static inline void thp_split_mm(struct mm_struct *mm) +{ +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Remove all empty zero pages from the mapping for lazy refaulting @@ -2538,16 +2570,34 @@ int s390_enable_sie(void) /* Fail if the page tables are 2K */ if (!mm_alloc_pgste(mm)) return -EINVAL; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return 0; } EXPORT_SYMBOL_GPL(s390_enable_sie); +int gmap_mark_unmergeable(void) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int ret; + VMA_ITERATOR(vmi, mm, 0); + + for_each_vma(vmi, vma) { + ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags); + if (ret) + return ret; + } + mm->def_flags &= ~VM_MERGEABLE; + return 0; +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + /* * Enable storage key handling from now on and initialize the storage * keys with the default key. @@ -2560,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, unsigned long hmask, unsigned long next, struct mm_walk *walk) @@ -2582,39 +2644,35 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); + cond_resched(); return 0; } static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, }; int s390_enable_skey(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; - down_write(&mm->mmap_sem); + mmap_write_lock(mm); if (mm_uses_skeys(mm)) goto out_up; mm->context.uses_skeys = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.uses_skeys = 0; - rc = -ENOMEM; - goto out_up; - } + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return rc; } EXPORT_SYMBOL_GPL(s390_enable_skey); @@ -2635,8 +2693,174 @@ static const struct mm_walk_ops reset_cmma_walk_ops = { void s390_reset_cmma(struct mm_struct *mm) { - down_write(&mm->mmap_sem); + mmap_write_lock(mm); walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, +}; + +/* + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. + */ +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) +{ + unsigned long i; + + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); + +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } + return 0; +} +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); + +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; + + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) +{ + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + + /* + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. + */ + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; +} +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 5674710a4841..c299a18273ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -9,6 +9,7 @@ #define KMSG_COMPONENT "hugetlb" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <asm/pgalloc.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> @@ -72,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + unsigned long pteval; int present; - pte_t pte; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) present = pud_present(__pud(rste)); @@ -101,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste) * u unused, l large */ if (present) { - pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; - pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, - _PAGE_READ); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, - _PAGE_WRITE); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, - _PAGE_INVALID); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, - _PAGE_PROTECT); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, - _PAGE_DIRTY); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, - _PAGE_YOUNG); + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); #ifdef CONFIG_MEM_SOFT_DIRTY - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, - _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, - _PAGE_NOEXEC); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); } else - pte_val(pte) = _PAGE_INVALID; - return pte; + pteval = _PAGE_INVALID; + return __pte(pteval); } static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) @@ -159,12 +152,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ - if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE; - else + if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { + if (likely(pte_present(pte))) + rste |= _REGION3_ENTRY_LARGE; + rste |= _REGION_ENTRY_TYPE_R3; + } else if (likely(pte_present(pte))) rste |= _SEGMENT_ENTRY_LARGE; + clear_huge_pte_skeys(mm, rste); - pte_val(*ptep) = rste; + set_pte(ptep, __pte(rste)); } pte_t huge_ptep_get(pte_t *ptep) @@ -186,7 +182,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, return pte; } -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgdp; @@ -241,35 +237,15 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) +bool __init arch_hugetlb_valid_size(unsigned long size) { - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); -} - -static __init int setup_hugepagesz(char *opt) -{ - unsigned long size; - char *string = opt; - - size = memparse(opt, &opt); - if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) { - hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); - } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) { - hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); - } else { - hugetlb_bad_size(); - pr_err("hugepagesz= specifies an unsupported page size %s\n", - string); - return 0; - } - return 1; + if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) + return true; + else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) + return true; + else + return false; } -__setup("hugepagesz=", setup_hugepagesz); static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, @@ -326,7 +302,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - int rc; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -353,15 +328,9 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, else addr = hugetlb_get_unmapped_area_topdown(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - return addr; + return check_asce_limit(mm, addr, len); } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ac44bd76db4b..97d66a3e60fb 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -33,10 +33,11 @@ #include <linux/dma-direct.h> #include <asm/processor.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/kfence.h> +#include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -46,15 +47,18 @@ #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> +#include <linux/virtio_config.h> -pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); +static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); + +unsigned long s390_invalid_asce; unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); -bool initmem_freed; - static void __init setup_zero_pages(void) { unsigned int order; @@ -91,6 +95,9 @@ void __init paging_init(void) unsigned long pgd_type, asce_bits; psw_t psw; + s390_invalid_asce = (unsigned long)invalid_pg_dir; + s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); init_mm.pgd = swapper_pg_dir; if (VMALLOC_END > _REGION2_SIZE) { asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; @@ -101,14 +108,14 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; + S390_lowcore.user_asce = s390_invalid_asce; crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); - kasan_copy_shadow(init_mm.pgd); + kasan_copy_shadow_mapping(); /* enable virtual mapping in kernel mode */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); + __ctl_load(S390_lowcore.user_asce, 7, 7); __ctl_load(S390_lowcore.kernel_asce, 13, 13); psw.mask = __extract_psw(); psw_bits(psw).dat = 1; @@ -116,13 +123,12 @@ void __init paging_init(void) __load_psw_mask(psw.mask); kasan_free_early_identity(); - sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; - free_area_init_nodes(max_zone_pfns); + free_area_init(max_zone_pfns); } void mark_rodata_ro(void) @@ -131,6 +137,7 @@ void mark_rodata_ro(void) set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); + debug_checkwx(); } int set_memory_encrypted(unsigned long addr, int numpages) @@ -168,10 +175,11 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + /* make sure bounce buffers are shared */ - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); - swiotlb_force = SWIOTLB_FORCE; } void __init mem_init(void) @@ -183,7 +191,7 @@ void __init mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); pv_init(); - + kfence_split_mapping(); /* Setup guest page hinting */ cmma_init(); @@ -192,16 +200,16 @@ void __init mem_init(void) setup_zero_pages(); /* Setup zeroed pages. */ cmma_init_nodat(); - - mem_init_print_info(NULL); } void free_initmem(void) { - initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); + free_reserved_area(sclp_early_sccb, + sclp_early_sccb + EXT_SCCB_READ_SCP, + POISON_FREE_INITMEM, "unused early sccb"); free_initmem_default(POISON_FREE_INITMEM); } @@ -268,27 +276,30 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(restrictions->altmap)) + if (WARN_ON_ONCE(params->altmap)) + return -EINVAL; + + if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; + VM_BUG_ON(!mhp_range_allowed(start, size, true)); rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, restrictions); + rc = __add_pages(nid, start_pfn, size_pages, params); if (rc) vmem_remove_mapping(start, size); return rc; } -void arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 06345616a646..9f988d4582ed 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -2,8 +2,8 @@ #include <linux/kasan.h> #include <linux/sched/task.h> #include <linux/memblock.h> +#include <linux/pgtable.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/kasan.h> #include <asm/mem_detect.h> #include <asm/processor.h> @@ -11,6 +11,7 @@ #include <asm/facility.h> #include <asm/sections.h> #include <asm/setup.h> +#include <asm/uv.h> static unsigned long segment_pos __initdata; static unsigned long segment_low __initdata; @@ -85,7 +86,7 @@ enum populate_mode { POPULATE_ZERO_SHADOW, POPULATE_SHALLOW }; -static void __init kasan_early_vmemmap_populate(unsigned long address, +static void __init kasan_early_pgtable_populate(unsigned long address, unsigned long end, enum populate_mode mode) { @@ -99,9 +100,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); if (!has_nx) pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL_EXEC); - sgt_prot = pgprot_val(SEGMENT_KERNEL_EXEC); + pgt_prot = pgprot_val(PAGE_KERNEL); + sgt_prot = pgprot_val(SEGMENT_KERNEL); + if (!has_nx || mode == POPULATE_ONE2ONE) { + pgt_prot &= ~_PAGE_NOEXEC; + sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; + } + /* + * The first 1MB of 1:1 mapping is mapped with 4KB pages + */ while (address < end) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { @@ -117,8 +125,7 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pgd_populate(&init_mm, pg_dir, p4_dir); } - if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) && - mode == POPULATE_SHALLOW) { + if (mode == POPULATE_SHALLOW) { address = (address + P4D_SIZE) & P4D_MASK; continue; } @@ -137,12 +144,6 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, p4d_populate(&init_mm, p4_dir, pu_dir); } - if (!IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING) && - mode == POPULATE_SHALLOW) { - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pu_dir = pud_offset(p4_dir, address); if (pud_none(*pu_dir)) { if (mode == POPULATE_ZERO_SHADOW && @@ -159,30 +160,26 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, pm_dir = pmd_offset(pu_dir, address); if (pmd_none(*pm_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PMD_SIZE) && + if (IS_ALIGNED(address, PMD_SIZE) && end - address >= PMD_SIZE) { - pmd_populate(&init_mm, pm_dir, - kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - /* the first megabyte of 1:1 is mapped with 4k pages */ - if (has_edat && address && end - address >= PMD_SIZE && - mode != POPULATE_ZERO_SHADOW) { - void *page; - - if (mode == POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); + if (mode == POPULATE_ZERO_SHADOW) { + pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); + address = (address + PMD_SIZE) & PMD_MASK; + continue; + } else if (has_edat && address) { + void *page; + + if (mode == POPULATE_ONE2ONE) { + page = (void *)address; + } else { + page = kasan_early_alloc_segment(); + memset(page, 0, _SEGMENT_SIZE); + } + set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); + address = (address + PMD_SIZE) & PMD_MASK; + continue; } - pmd_val(*pm_dir) = __pa(page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; - continue; } - pt_dir = kasan_early_pte_alloc(); pmd_populate(&init_mm, pm_dir, pt_dir); } else if (pmd_large(*pm_dir)) { @@ -197,16 +194,16 @@ static void __init kasan_early_vmemmap_populate(unsigned long address, switch (mode) { case POPULATE_ONE2ONE: page = (void *)address; - pte_val(*pt_dir) = __pa(page) | pgt_prot; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); break; case POPULATE_MAP: page = kasan_early_alloc_pages(0); memset(page, 0, PAGE_SIZE); - pte_val(*pt_dir) = __pa(page) | pgt_prot; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); break; case POPULATE_ZERO_SHADOW: page = kasan_early_shadow_page; - pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); break; case POPULATE_SHALLOW: /* should never happen */ @@ -254,12 +251,9 @@ static void __init kasan_early_detect_facilities(void) void __init kasan_early_init(void) { - unsigned long untracked_mem_end; unsigned long shadow_alloc_size; unsigned long initrd_end; - unsigned long asce_type; unsigned long memsize; - unsigned long vmax; unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); pte_t pte_z; pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); @@ -274,30 +268,23 @@ void __init kasan_early_init(void) memsize = get_mem_detect_end(); if (!memsize) kasan_early_panic("cannot detect physical memory size\n"); - /* respect mem= cmdline parameter */ - if (memory_end_set && memsize > memory_end) - memsize = memory_end; - if (IS_ENABLED(CONFIG_CRASH_DUMP) && OLDMEM_BASE) - memsize = min(memsize, OLDMEM_SIZE); - memsize = min(memsize, KASAN_SHADOW_START); - - if (IS_ENABLED(CONFIG_KASAN_S390_4_LEVEL_PAGING)) { - /* 4 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION2_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION1_SIZE; - asce_type = _ASCE_TYPE_REGION2; - } else { - /* 3 level paging */ - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PUD_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PUD_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, - _REGION3_ENTRY_EMPTY); - untracked_mem_end = vmax = _REGION2_SIZE; - asce_type = _ASCE_TYPE_REGION3; - } + /* + * Kasan currently supports standby memory but only if it follows + * online memory (default allocation), i.e. no memory holes. + * - memsize represents end of online memory + * - ident_map_size represents online + standby and memory limits + * accounted. + * Kasan maps "memsize" right away. + * [0, memsize] - as identity mapping + * [__sha(0), __sha(memsize)] - shadow memory for identity mapping + * The rest [memsize, ident_map_size] if memsize < ident_map_size + * could be mapped/unmapped dynamically later during memory hotplug. + */ + memsize = min(memsize, ident_map_size); + + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); + BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); + crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); /* init kasan zero shadow */ crst_table_init((unsigned long *)kasan_early_shadow_p4d, @@ -312,7 +299,7 @@ void __init kasan_early_init(void) pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { initrd_end = - round_up(INITRD_START + INITRD_SIZE, _SEGMENT_SIZE); + round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); pgalloc_low = max(pgalloc_low, initrd_end); } @@ -363,24 +350,25 @@ void __init kasan_early_init(void) * +-----------------+ +- shadow end ---+ */ /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_vmemmap_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_MODULES)) - untracked_mem_end = vmax - MODULES_LEN; + kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - untracked_mem_end = vmax - vmalloc_size - MODULES_LEN; /* shallowly populate kasan shadow for vmalloc and modules */ - kasan_early_vmemmap_populate(__sha(untracked_mem_end), - __sha(vmax), POPULATE_SHALLOW); + kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), + POPULATE_SHALLOW); } /* populate kasan shadow for untracked memory */ - kasan_early_vmemmap_populate(__sha(max_physmem_end), - __sha(untracked_mem_end), + kasan_early_pgtable_populate(__sha(ident_map_size), + IS_ENABLED(CONFIG_KASAN_VMALLOC) ? + __sha(VMALLOC_START) : + __sha(MODULES_VADDR), + POPULATE_ZERO_SHADOW); + kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), POPULATE_ZERO_SHADOW); /* memory allocated for identity mapping structs will be freed later */ pgalloc_freeable = pgalloc_pos; /* populate identity mapping */ - kasan_early_vmemmap_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, asce_type); + kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); + kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); kasan_enable_dat(); /* enable kasan */ init_task.kasan_depth = 0; @@ -388,7 +376,7 @@ void __init kasan_early_init(void) sclp_early_printk("KernelAddressSanitizer initialized\n"); } -void __init kasan_copy_shadow(pgd_t *pg_dir) +void __init kasan_copy_shadow_mapping(void) { /* * At this point we are still running on early pages setup early_pg_dir, @@ -400,27 +388,16 @@ void __init kasan_copy_shadow(pgd_t *pg_dir) pgd_t *pg_dir_dst; p4d_t *p4_dir_src; p4d_t *p4_dir_dst; - pud_t *pu_dir_src; - pud_t *pu_dir_dst; pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(pg_dir, KASAN_SHADOW_START); + pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - if (!p4d_folded(*p4_dir_src)) { - /* 4 level paging */ - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); - return; - } - /* 3 level paging */ - pu_dir_src = pud_offset(p4_dir_src, KASAN_SHADOW_START); - pu_dir_dst = pud_offset(p4_dir_dst, KASAN_SHADOW_START); - memcpy(pu_dir_dst, pu_dir_src, - (KASAN_SHADOW_SIZE >> PUD_SHIFT) * sizeof(pud_t)); + memcpy(p4_dir_dst, p4_dir_src, + (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); } void __init kasan_free_early_identity(void) { - memblock_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); + memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); } diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index de7ca4b6718f..1571cdcb0c50 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -4,8 +4,6 @@ * * Copyright IBM Corp. 2009, 2015 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/uaccess.h> @@ -14,9 +12,17 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> +#include <linux/uio.h> +#include <asm/asm-extable.h> #include <asm/ctl_reg.h> #include <asm/io.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/maccess.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +static __ro_after_init pte_t *memcpy_real_ptep; +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -55,155 +61,94 @@ static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t siz */ static DEFINE_SPINLOCK(s390_kernel_write_lock); -void notrace s390_kernel_write(void *dst, const void *src, size_t size) +notrace void *s390_kernel_write(void *dst, const void *src, size_t size) { + void *tmp = dst; unsigned long flags; long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - while (size) { - copied = s390_kernel_write_odd(dst, src, size); - dst += copied; - src += copied; - size -= copied; + if (!(flags & PSW_MASK_DAT)) { + memcpy(dst, src, size); + } else { + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; + } } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); -} - -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - register unsigned long _dest asm("2") = (unsigned long) dest; - register unsigned long _len1 asm("3") = (unsigned long) count; - register unsigned long _src asm("4") = (unsigned long) src; - register unsigned long _len2 asm("5") = (unsigned long) count; - int rc = -EFAULT; - - asm volatile ( - "0: mvcle %1,%2,0x0\n" - "1: jo 0b\n" - " lhi %0,0x0\n" - "2:\n" - EX_TABLE(1b,2b) - : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), - "+d" (_len2), "=m" (*((long *) dest)) - : "m" (*((long *) src)) - : "cc", "memory"); - return rc; -} -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) -{ - int irqs_disabled, rc; - unsigned long flags; - - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; + return dst; } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) +void __init memcpy_real_init(void) { - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3, - dest, src, count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real((unsigned long) dest,(unsigned long) src, - (unsigned long) count); + memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); + if (!memcpy_real_ptep) + panic("Couldn't setup memcpy real area"); } -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + while (count) { + phys = src & PAGE_MASK; + offset = src & ~PAGE_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, PAGE_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; + lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -214,27 +159,46 @@ static int is_swapped(unsigned long addr) */ void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; + unsigned long flags; unsigned long size; + int this_cpu, cpu; - get_online_cpus(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + cpus_read_lock(); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); - put_online_cpus(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(&flags); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc, flags); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); + cpus_read_unlock(); return bounce; } /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index cbc718ba6d78..3327c47bc181 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,7 +17,6 @@ #include <linux/random.h> #include <linux/compat.h> #include <linux/security.h> -#include <asm/pgalloc.h> #include <asm/elf.h> static unsigned long stack_maxrandom_size(void) @@ -38,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -59,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd, /* * Top of mmap area (just below the process stack). - * Leave at least a ~32 MB hole. + * Leave at least a ~128 MB hole. */ - gap_min = 32 * 1024 * 1024UL; + gap_min = SZ_128M; gap_max = (STACK_TOP / 6) * 5; if (gap < gap_min) @@ -72,14 +71,13 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct vm_unmapped_area_info info; - int rc; if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; @@ -105,30 +103,20 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } -unsigned long -arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, const unsigned long pgoff, - const unsigned long flags) +unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0; struct vm_unmapped_area_info info; - int rc; /* requested length too big for entire address space */ if (len > TASK_SIZE - mmap_min_addr) @@ -163,25 +151,18 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; addr = vm_unmapped_area(&info); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; } check_asce_limit: - if (addr + len > current->mm->context.asce_limit && - addr + len <= TASK_SIZE) { - rc = crst_table_upgrade(mm, addr + len); - if (rc) - return (unsigned long) rc; - } - - return addr; + return check_asce_limit(mm, addr, len); } /* @@ -207,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index fc141893d028..d5ea09d78938 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -14,6 +14,7 @@ #include <linux/memblock.h> #include <linux/gfp.h> #include <linux/init.h> +#include <asm/asm-extable.h> #include <asm/facility.h> #include <asm/page-states.h> @@ -31,17 +32,17 @@ __setup("cmma=", cmma); static inline int cmma_test_essa(void) { - register unsigned long tmp asm("0") = 0; - register int rc asm("1"); + unsigned long tmp = 0; + int rc = -EOPNOTSUPP; /* test ESSA_GET_STATE */ asm volatile( - " .insn rrf,0xb9ab0000,%1,%1,%2,0\n" - "0: la %0,0\n" + " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" + "0: la %[rc],0\n" "1:\n" EX_TABLE(0b,1b) - : "=&d" (rc), "+&d" (tmp) - : "i" (ESSA_GET_STATE), "0" (-EOPNOTSUPP)); + : [rc] "+&d" (rc), [tmp] "+&d" (tmp) + : [cmd] "i" (ESSA_GET_STATE)); return rc; } @@ -112,7 +113,7 @@ static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) next = pmd_addr_end(addr, end); if (pmd_none(*pmd) || pmd_large(*pmd)) continue; - page = virt_to_page(pmd_val(*pmd)); + page = phys_to_page(pmd_val(*pmd)); set_bit(PG_arch_1, &page->flags); } while (pmd++, addr = next, addr != end); } @@ -130,7 +131,7 @@ static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) if (pud_none(*pud) || pud_large(*pud)) continue; if (!pud_folded(*pud)) { - page = virt_to_page(pud_val(*pud)); + page = phys_to_page(pud_val(*pud)); for (i = 0; i < 3; i++) set_bit(PG_arch_1, &page[i].flags); } @@ -151,7 +152,7 @@ static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) if (p4d_none(*p4d)) continue; if (!p4d_folded(*p4d)) { - page = virt_to_page(p4d_val(*p4d)); + page = phys_to_page(p4d_val(*p4d)); for (i = 0; i < 3; i++) set_bit(PG_arch_1, &page[i].flags); } @@ -173,7 +174,7 @@ static void mark_kernel_pgd(void) if (pgd_none(*pgd)) continue; if (!pgd_folded(*pgd)) { - page = virt_to_page(pgd_val(*pgd)); + page = phys_to_page(pgd_val(*pgd)); for (i = 0; i < 3; i++) set_bit(PG_arch_1, &page[i].flags); } @@ -183,9 +184,9 @@ static void mark_kernel_pgd(void) void __init cmma_init_nodat(void) { - struct memblock_region *reg; struct page *page; unsigned long start, end, ix; + int i; if (cmma_flag < 2) return; @@ -193,9 +194,7 @@ void __init cmma_init_nodat(void) mark_kernel_pgd(); /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_memblock(memory, reg) { - start = memblock_region_memory_base_pfn(reg); - end = memblock_region_memory_end_pfn(reg); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { page = pfn_to_page(start); for (ix = start; ix < end; ix++, page++) { if (__test_and_clear_bit(PG_arch_1, &page->flags)) @@ -230,46 +229,3 @@ void arch_set_page_dat(struct page *page, int order) return; set_page_stable_dat(page, order); } - -void arch_set_page_nodat(struct page *page, int order) -{ - if (cmma_flag < 2) - return; - set_page_stable_nodat(page, order); -} - -int arch_test_page_nodat(struct page *page) -{ - unsigned char state; - - if (cmma_flag < 2) - return 0; - state = get_page_state(page); - return !!(state & 0x20); -} - -void arch_set_page_states(int make_stable) -{ - unsigned long flags, order, t; - struct list_head *l; - struct page *page; - struct zone *zone; - - if (!cmma_flag) - return; - if (make_stable) - drain_local_pages(NULL); - for_each_populated_zone(zone) { - spin_lock_irqsave(&zone->lock, flags); - for_each_migratetype_order(order, t) { - list_for_each(l, &zone->free_area[order].free_list[t]) { - page = list_entry(l, struct page, lru); - if (make_stable) - set_page_stable_dat(page, order); - else - set_page_unused(page, order); - } - } - spin_unlock_irqrestore(&zone->lock, flags); - } -} diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index f8c6faab41f4..85195c18b2e8 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -7,8 +7,8 @@ #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> -#include <asm/pgtable.h> #include <asm/pgalloc.h> +#include <asm/kfence.h> #include <asm/page.h> #include <asm/set_memory.h> @@ -57,7 +57,7 @@ void arch_report_meminfo(struct seq_file *m) static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, unsigned long dtt) { - unsigned long table, mask; + unsigned long *table, mask; mask = 0; if (MACHINE_HAS_EDAT2) { @@ -72,7 +72,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1); break; } - table = (unsigned long)old & mask; + table = (unsigned long *)((unsigned long)old & mask); crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); @@ -86,7 +86,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, { pte_t *ptep, new; - ptep = pte_offset(pmdp, addr); + if (flags == SET_MEMORY_4K) + return 0; + ptep = pte_offset_kernel(pmdp, addr); do { new = *ptep; if (pte_none(new)) @@ -96,9 +98,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, else if (flags & SET_MEMORY_RW) new = pte_mkwrite(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) - pte_val(new) |= _PAGE_NOEXEC; + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) - pte_val(new) &= ~_PAGE_NOEXEC; + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -125,11 +127,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr) prot &= ~_PAGE_NOEXEC; ptep = pt_dir; for (i = 0; i < PTRS_PER_PTE; i++) { - pte_val(*ptep) = pte_addr | prot; + set_pte(ptep, __pte(pte_addr | prot)); pte_addr += PAGE_SIZE; ptep++; } - pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY; + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); update_page_count(PG_DIRECT_MAP_1M, -1); @@ -146,9 +148,9 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pmd_mkwrite(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) - pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -156,6 +158,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pmd_t *pmdp; int rc = 0; @@ -165,7 +168,10 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, return -EINVAL; next = pmd_addr_end(addr, end); if (pmd_large(*pmdp)) { - if (addr & ~PMD_MASK || addr + PMD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PMD_MASK); + need_split |= !!(addr + PMD_SIZE > next); + if (need_split) { rc = split_pmd_page(pmdp, addr); if (rc) return rc; @@ -202,11 +208,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) prot &= ~_SEGMENT_ENTRY_NOEXEC; pmdp = pm_dir; for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_val(*pmdp) = pmd_addr | prot; + set_pmd(pmdp, __pmd(pmd_addr | prot)); pmd_addr += PMD_SIZE; pmdp++; } - pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); update_page_count(PG_DIRECT_MAP_2G, -1); @@ -223,9 +229,9 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pud_mkwrite(pud_mkdirty(new)); if (flags & SET_MEMORY_NX) - pud_val(new) |= _REGION_ENTRY_NOEXEC; + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pud_val(new) &= ~_REGION_ENTRY_NOEXEC; + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -233,6 +239,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long flags) { unsigned long next; + int need_split; pud_t *pudp; int rc = 0; @@ -242,7 +249,10 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, return -EINVAL; next = pud_addr_end(addr, end); if (pud_large(*pudp)) { - if (addr & ~PUD_MASK || addr + PUD_SIZE > next) { + need_split = !!(flags & SET_MEMORY_4K); + need_split |= !!(addr & ~PUD_MASK); + need_split |= !!(addr + PUD_SIZE > next); + if (need_split) { rc = split_pud_page(pudp, addr); if (rc) break; @@ -279,7 +289,7 @@ static int walk_p4d_level(pgd_t *pgd, unsigned long addr, unsigned long end, return rc; } -static DEFINE_MUTEX(cpa_mutex); +DEFINE_MUTEX(cpa_mutex); static int change_page_attr(unsigned long addr, unsigned long end, unsigned long flags) @@ -317,7 +327,7 @@ int __set_memory(unsigned long addr, int numpages, unsigned long flags) return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); } -#ifdef CONFIG_DEBUG_PAGEALLOC +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static void ipte_range(pte_t *pte, unsigned long address, int nr) { @@ -337,50 +347,27 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long address; + pte_t *ptep, pte; int nr, i, j; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; for (i = 0; i < numpages;) { - address = page_to_phys(page + i); - pgd = pgd_offset_k(address); - p4d = p4d_offset(pgd, address); - pud = pud_offset(p4d, address); - pmd = pmd_offset(pud, address); - pte = pte_offset_kernel(pmd, address); - nr = (unsigned long)pte >> ilog2(sizeof(long)); + address = (unsigned long)page_to_virt(page + i); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); nr = min(numpages - i, nr); if (enable) { for (j = 0; j < nr; j++) { - pte_val(*pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); address += PAGE_SIZE; - pte++; + ptep++; } } else { - ipte_range(pte, address, nr); + ipte_range(ptep, address, nr); } i += nr; } } -#ifdef CONFIG_HIBERNATION -bool kernel_page_present(struct page *page) -{ - unsigned long addr; - int cc; - - addr = page_to_phys(page); - asm volatile( - " lra %1,0(%1)\n" - " ipm %0\n" - " srl %0,28" - : "=d" (cc), "+a" (addr) : : "cc"); - return cc == 0; -} -#endif /* CONFIG_HIBERNATION */ - #endif /* CONFIG_DEBUG_PAGEALLOC */ diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 3dd253f81a77..2de48b2c1b04 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -53,91 +53,92 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, 2); + struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return NULL; - arch_set_page_dat(page, 2); - return (unsigned long *) page_to_phys(page); + arch_set_page_dat(page, CRST_ALLOC_ORDER); + return (unsigned long *) page_to_virt(page); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long) table, 2); + free_pages((unsigned long)table, CRST_ALLOC_ORDER); } static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; - if (current->active_mm == mm) - set_user_asce(mm); + /* change all active ASCEs to avoid the creation of new TLBs */ + if (current->active_mm == mm) { + S390_lowcore.user_asce = mm->context.asce; + __ctl_load(S390_lowcore.user_asce, 7, 7); + } __tlb_flush_local(); } int crst_table_upgrade(struct mm_struct *mm, unsigned long end) { - unsigned long *table, *pgd; - int rc, notify; + unsigned long *pgd = NULL, *p4d = NULL, *__pgd; + unsigned long asce_limit = mm->context.asce_limit; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); - rc = 0; - notify = 0; - while (mm->context.asce_limit < end) { - table = crst_table_alloc(mm); - if (!table) { - rc = -ENOMEM; - break; - } - spin_lock_bh(&mm->page_table_lock); - pgd = (unsigned long *) mm->pgd; - if (mm->context.asce_limit == _REGION2_SIZE) { - crst_table_init(table, _REGION2_ENTRY_EMPTY); - p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = _REGION1_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION2; - mm_inc_nr_puds(mm); - } else { - crst_table_init(table, _REGION1_ENTRY_EMPTY); - pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd); - mm->pgd = (pgd_t *) table; - mm->context.asce_limit = -PAGE_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_REGION1; - } - notify = 1; - spin_unlock_bh(&mm->page_table_lock); - } - if (notify) - on_each_cpu(__crst_table_upgrade, mm, 0); - return rc; -} + VM_BUG_ON(asce_limit < _REGION2_SIZE); -void crst_table_downgrade(struct mm_struct *mm) -{ - pgd_t *pgd; + if (end <= asce_limit) + return 0; - /* downgrade should only happen from 3 to 2 levels (compat only) */ - VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + if (asce_limit == _REGION2_SIZE) { + p4d = crst_table_alloc(mm); + if (unlikely(!p4d)) + goto err_p4d; + crst_table_init(p4d, _REGION2_ENTRY_EMPTY); + } + if (end > _REGION1_SIZE) { + pgd = crst_table_alloc(mm); + if (unlikely(!pgd)) + goto err_pgd; + crst_table_init(pgd, _REGION1_ENTRY_EMPTY); + } - if (current->active_mm == mm) { - clear_user_asce(); - __tlb_flush_mm(mm); + spin_lock_bh(&mm->page_table_lock); + + /* + * This routine gets called with mmap_lock lock held and there is + * no reason to optimize for the case of otherwise. However, if + * that would ever change, the below check will let us know. + */ + VM_BUG_ON(asce_limit != mm->context.asce_limit); + + if (p4d) { + __pgd = (unsigned long *) mm->pgd; + p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd); + mm->pgd = (pgd_t *) p4d; + mm->context.asce_limit = _REGION1_SIZE; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION2; + mm_inc_nr_puds(mm); + } + if (pgd) { + __pgd = (unsigned long *) mm->pgd; + pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd); + mm->pgd = (pgd_t *) pgd; + mm->context.asce_limit = TASK_SIZE_MAX; + mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | _ASCE_TYPE_REGION1; } - pgd = mm->pgd; - mm_dec_nr_pmds(mm); - mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); - mm->context.asce_limit = _REGION3_SIZE; - mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | - _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - crst_table_free(mm, (unsigned long *) pgd); + spin_unlock_bh(&mm->page_table_lock); + + on_each_cpu(__crst_table_upgrade, mm, 0); - if (current->active_mm == mm) - set_user_asce(mm); + return 0; + +err_pgd: + crst_table_free(mm, p4d); +err_p4d: + return -ENOMEM; } static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) @@ -160,7 +161,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (page) { - table = (u64 *)page_to_phys(page); + table = (u64 *)page_to_virt(page); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } @@ -175,7 +176,75 @@ void page_table_free_pgste(struct page *page) #endif /* CONFIG_PGSTE */ /* - * page table entry allocation/free routines. + * A 2KB-pgtable is either upper or lower half of a normal page. + * The second half of the page may be unused or used as another + * 2KB-pgtable. + * + * Whenever possible the parent page for a new 2KB-pgtable is picked + * from the list of partially allocated pages mm_context_t::pgtable_list. + * In case the list is empty a new parent page is allocated and added to + * the list. + * + * When a parent page gets fully allocated it contains 2KB-pgtables in both + * upper and lower halves and is removed from mm_context_t::pgtable_list. + * + * When 2KB-pgtable is freed from to fully allocated parent page that + * page turns partially allocated and added to mm_context_t::pgtable_list. + * + * If 2KB-pgtable is freed from the partially allocated parent page that + * page turns unused and gets removed from mm_context_t::pgtable_list. + * Furthermore, the unused parent page is released. + * + * As follows from the above, no unallocated or fully allocated parent + * pages are contained in mm_context_t::pgtable_list. + * + * The upper byte (bits 24-31) of the parent page _refcount is used + * for tracking contained 2KB-pgtables and has the following format: + * + * PP AA + * 01234567 upper byte (bits 24-31) of struct page::_refcount + * || || + * || |+--- upper 2KB-pgtable is allocated + * || +---- lower 2KB-pgtable is allocated + * |+------- upper 2KB-pgtable is pending for removal + * +-------- lower 2KB-pgtable is pending for removal + * + * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why + * using _refcount is possible). + * + * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still unallocated; + * - removed from mm_context_t::pgtable_list in case both hales of the + * parent page are allocated; + * These operations are protected with mm_context_t::lock. + * + * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 + * and the corresponding PP bit is set to 1 in a single atomic operation. + * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually + * exclusive and may never be both set to 1! + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still allocated; + * - removed from mm_context_t::pgtable_list in case the second half of + * the parent page is unallocated; + * These operations are protected with mm_context_t::lock. + * + * It is important to understand that mm_context_t::lock only protects + * mm_context_t::pgtable_list and AA bits, but not the parent page itself + * and PP bits. + * + * Releasing the parent page happens whenever the PP bit turns from 1 to 0, + * while both AA bits and the second PP bit are already unset. Then the + * parent page does not contain any 2KB-pgtable fragment anymore, and it has + * also been removed from mm_context_t::pgtable_list. It is safe to release + * the page therefore. + * + * PGSTE memory spaces use full 4KB-pgtables and do not need most of the + * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable + * while the PP bits are never used, nor such a page is added to or removed + * from mm_context_t::pgtable_list. */ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -191,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = list_first_entry(&mm->context.pgtable_list, struct page, lru); mask = atomic_read(&page->_refcount) >> 24; - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { - table = (unsigned long *) page_to_phys(page); + /* + * The pending removal bits must also be checked. + * Failure to do so might lead to an impossible + * value of (i.e 0x13 or 0x23) written to _refcount. + * Such values violate the assumption that pending and + * allocation bits are mutually exclusive, and the rest + * of the code unrails as result. That could lead to + * a whole bunch of races and corruptions. + */ + mask = (mask | (mask >> 4)) & 0x03U; + if (mask != 0x03U) { + table = (unsigned long *) page_to_virt(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, - 1U << (bit + 24)); + 0x01U << (bit + 24)); list_del(&page->lru); } } @@ -216,15 +294,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm) } arch_set_page_dat(page, 0); /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); + table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 3 << 24); + atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 1 << 24); + atomic_xor_bits(&page->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); @@ -233,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } +static void page_table_release_check(struct page *page, void *table, + unsigned int half, unsigned int mask) +{ + char msg[128]; + + if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + return; + snprintf(msg, sizeof(msg), + "Invalid pgtable %p release half 0x%02x mask 0x%02x", + table, half, mask); + dump_page(page, msg); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { + unsigned int mask, bit, half; struct page *page; - unsigned int bit, mask; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); + /* + * Mark the page for delayed release. The actual release + * will happen outside of the critical section from this + * function or from __tlb_remove_table() + */ + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 3) + if (mask & 0x03U) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); spin_unlock_bh(&mm->context.lock); - if (mask != 0) + mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask >>= 24; + if (mask != 0x00U) return; + half = 0x01U << bit; } else { - atomic_xor_bits(&page->_refcount, 3U << 24); + half = 0x03U; + mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask >>= 24; } + page_table_release_check(page, table, half, mask); pgtable_pte_page_dtor(page); __free_page(page); } @@ -268,50 +370,57 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned int bit, mask; mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); + table = (unsigned long *) ((unsigned long)table | 0x03U); tlb_remove_table(tlb, table); return; } - bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); + /* + * Mark the page for delayed release. The actual release will happen + * outside of the critical section from __tlb_remove_table() or from + * page_table_free() + */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 3) + if (mask & 0x03U) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); + table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); } void __tlb_remove_table(void *_table) { - unsigned int mask = (unsigned long) _table & 3; + unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + struct page *page = virt_to_page(table); - switch (mask) { - case 0: /* pmd, pud, or p4d */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ + switch (half) { + case 0x00U: /* pmd, pud, or p4d */ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); + return; + case 0x01U: /* lower 2K of a 4K page table */ + case 0x02U: /* higher 2K of a 4K page table */ mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); mask >>= 24; - if (mask != 0) - break; - /* fallthrough */ - case 3: /* 4K page table with pgstes */ - if (mask & 3) - atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_pte_page_dtor(page); - __free_page(page); + if (mask != 0x00U) + return; + break; + case 0x03U: /* 4K page table with pgstes */ + mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask >>= 24; break; } + + page_table_release_check(page, table, half, mask); + pgtable_pte_page_dtor(page); + __free_page(page); } /* @@ -321,34 +430,34 @@ void __tlb_remove_table(void *_table) static struct kmem_cache *base_pgt_cache; -static unsigned long base_pgt_alloc(void) +static unsigned long *base_pgt_alloc(void) { - u64 *table; + unsigned long *table; table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); if (table) - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - return (unsigned long) table; + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; } -static void base_pgt_free(unsigned long table) +static void base_pgt_free(unsigned long *table) { - kmem_cache_free(base_pgt_cache, (void *) table); + kmem_cache_free(base_pgt_cache, table); } -static unsigned long base_crst_alloc(unsigned long val) +static unsigned long *base_crst_alloc(unsigned long val) { - unsigned long table; + unsigned long *table; - table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (table) - crst_table_init((unsigned long *)table, val); + crst_table_init(table, val); return table; } -static void base_crst_free(unsigned long table) +static void base_crst_free(unsigned long *table) { - free_pages(table, CRST_ALLOC_ORDER); + free_pages((unsigned long)table, CRST_ALLOC_ORDER); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -376,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address) return real; } -static int base_page_walk(unsigned long origin, unsigned long addr, +static int base_page_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { unsigned long *pte, next; if (!alloc) return 0; - pte = (unsigned long *) origin; + pte = origin; pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; do { next = base_page_addr_end(addr, end); @@ -392,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_segment_walk(unsigned long origin, unsigned long addr, +static int base_segment_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *ste, next, table; + unsigned long *ste, next, *table; int rc; - ste = (unsigned long *) origin; + ste = origin; ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; do { next = base_segment_addr_end(addr, end); @@ -408,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, table = base_pgt_alloc(); if (!table) return -ENOMEM; - *ste = table | _SEGMENT_ENTRY; + *ste = __pa(table) | _SEGMENT_ENTRY; } - table = *ste & _SEGMENT_ENTRY_ORIGIN; + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); rc = base_page_walk(table, addr, next, alloc); if (rc) return rc; @@ -421,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region3_walk(unsigned long origin, unsigned long addr, +static int base_region3_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rtte, next, table; + unsigned long *rtte, next, *table; int rc; - rtte = (unsigned long *) origin; + rtte = origin; rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; do { next = base_region3_addr_end(addr, end); @@ -437,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rtte = table | _REGION3_ENTRY; + *rtte = __pa(table) | _REGION3_ENTRY; } - table = *rtte & _REGION_ENTRY_ORIGIN; + table = __va(*rtte & _REGION_ENTRY_ORIGIN); rc = base_segment_walk(table, addr, next, alloc); if (rc) return rc; @@ -449,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region2_walk(unsigned long origin, unsigned long addr, +static int base_region2_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rste, next, table; + unsigned long *rste, next, *table; int rc; - rste = (unsigned long *) origin; + rste = origin; rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; do { next = base_region2_addr_end(addr, end); @@ -465,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rste = table | _REGION2_ENTRY; + *rste = __pa(table) | _REGION2_ENTRY; } - table = *rste & _REGION_ENTRY_ORIGIN; + table = __va(*rste & _REGION_ENTRY_ORIGIN); rc = base_region3_walk(table, addr, next, alloc); if (rc) return rc; @@ -477,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region1_walk(unsigned long origin, unsigned long addr, +static int base_region1_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rfte, next, table; + unsigned long *rfte, next, *table; int rc; - rfte = (unsigned long *) origin; + rfte = origin; rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; do { next = base_region1_addr_end(addr, end); @@ -493,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rfte = table | _REGION1_ENTRY; + *rfte = __pa(table) | _REGION1_ENTRY; } - table = *rfte & _REGION_ENTRY_ORIGIN; + table = __va(*rfte & _REGION_ENTRY_ORIGIN); rc = base_region2_walk(table, addr, next, alloc); if (rc) return rc; @@ -514,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, */ void base_asce_free(unsigned long asce) { - unsigned long table = asce & _ASCE_ORIGIN; + unsigned long *table = __va(asce & _ASCE_ORIGIN); if (!asce) return; @@ -529,7 +638,7 @@ void base_asce_free(unsigned long asce) base_region2_walk(table, 0, _REGION1_SIZE, 0); break; case _ASCE_TYPE_REGION1: - base_region1_walk(table, 0, -_PAGE_SIZE, 0); + base_region1_walk(table, 0, TASK_SIZE_MAX, 0); break; } base_crst_free(table); @@ -566,7 +675,7 @@ static int base_pgt_cache_init(void) */ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) { - unsigned long asce, table, end; + unsigned long asce, *table, end; int rc; if (base_pgt_cache_init()) @@ -577,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) if (!table) return 0; rc = base_segment_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; } else if (end <= _REGION2_SIZE) { table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return 0; rc = base_region3_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; } else if (end <= _REGION1_SIZE) { table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return 0; rc = base_region2_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; } else { table = base_crst_alloc(_REGION1_ENTRY_EMPTY); if (!table) return 0; rc = base_region1_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; } if (rc) { base_asce_free(asce); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 9ebd01219812..4909dcd762e8 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -19,13 +19,31 @@ #include <linux/ksm.h> #include <linux/mman.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/page-states.h> +pgprot_t pgprot_writecombine(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) | mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writecombine); + +pgprot_t pgprot_writethrough(pgprot_t prot) +{ + /* + * mio_wb_bit_mask may be set on a different CPU, but it is only set + * once at init and only read afterwards. + */ + return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask); +} +EXPORT_SYMBOL_GPL(pgprot_writethrough); + static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int nodat) { @@ -97,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pte_val(*ptep) |= _PAGE_INVALID; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); mm->context.flush_mm = 1; } else ptep_ipte_global(mm, addr, ptep, nodat); @@ -206,15 +224,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. */ - pte_val(entry) |= _PAGE_DIRTY; - pte_val(entry) &= ~_PAGE_PROTECT; + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ pgste_val(pgste) |= PGSTE_UC_BIT; } #endif - *ptep = entry; + set_pte(ptep, entry); return pgste; } @@ -257,12 +275,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm, pgste = pgste_update_all(old, pgste, mm); if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) - pte_val(old) |= _PAGE_UNUSED; + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); } pgste = pgste_set_pte(ptep, pgste, new); pgste_set_unlock(ptep, pgste); } else { - *ptep = new; + set_pte(ptep, new); } return old; } @@ -327,14 +345,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) - pte_val(pte) &= ~_PAGE_NOEXEC; + pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else { - *ptep = pte; + set_pte(ptep, pte); } preempt_enable(); } @@ -399,7 +417,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; if (mm_has_pgste(mm)) gmap_pmdp_invalidate(mm, addr); @@ -411,22 +429,36 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, } #ifdef CONFIG_PGSTE -static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) +static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) { + struct vm_area_struct *vma; pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; + + /* We need a valid VMA, otherwise this is clearly a fault. */ + vma = vma_lookup(mm, addr); + if (!vma) + return -EFAULT; pgd = pgd_offset(mm, addr); - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return NULL; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return NULL; - pmd = pmd_alloc(mm, pud, addr); - return pmd; + if (!pgd_present(*pgd)) + return -ENOENT; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return -ENOENT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return -ENOENT; + + /* Large PUDs are not supported yet. */ + if (pud_large(*pud)) + return -EFAULT; + + *pmdp = pmd_offset(pud, addr); + return 0; } #endif @@ -437,7 +469,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_direct(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -450,7 +482,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_lazy(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -507,7 +539,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pudp_flush_direct(mm, addr, pudp); - *pudp = new; + set_pud(pudp, new); preempt_enable(); return old; } @@ -547,9 +579,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) list_del(lh); } ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); ptep++; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -614,12 +646,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, if (prot == PROT_NONE && !pte_i) { ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); - pte_val(entry) |= _PAGE_INVALID; + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } if (prot == PROT_READ && !pte_p) { ptep_flush_direct(mm, addr, ptep, nodat); - pte_val(entry) &= ~_PAGE_INVALID; - pte_val(entry) |= _PAGE_PROTECT; + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } pgste_val(pgste) |= bit; pgste = pgste_set_pte(ptep, pgste, entry); @@ -643,8 +675,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, !(pte_val(pte) & _PAGE_PROTECT))) { pgste_val(spgste) |= PGSTE_VSIE_BIT; tpgste = pgste_get_lock(tptep); - pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT); + tpte = __pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); /* don't touch the storage key - it belongs to parent pgste */ tpgste = pgste_set_pte(tptep, tpgste, tpte); pgste_set_unlock(tptep, tpgste); @@ -673,7 +705,7 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = migration_entry_to_page(entry); + struct page *page = pfn_swap_entry_to_page(entry); dec_mm_counter(mm, mm_counter(page)); } @@ -716,7 +748,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -741,10 +773,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else - pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); } pgste_set_unlock(ptep, pgste); return dirty; @@ -760,14 +792,23 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * we can ignore attempts to set the key to 0, because it already is 0. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return key ? -EFAULT : 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return key ? -EFAULT : 0; } if (pmd_large(*pmdp)) { @@ -783,10 +824,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | PGSTE_ACC_BITS | PGSTE_FP_BIT); @@ -816,7 +854,7 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(set_guest_storage_key); -/** +/* * Conditionally set a guest storage key (handling csske). * oldkey will be updated when either mr or mc is set and a pointer is given. * @@ -849,7 +887,7 @@ int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(cond_set_guest_storage_key); -/** +/* * Reset a guest reference bit (rrbe), returning the reference and changed bit. * * Returns < 0 in case of error, otherwise the cc to be reported to the guest. @@ -863,14 +901,23 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) pte_t *ptep; int cc = 0; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0 and there is nothing for us to do. + */ + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); - return -EFAULT; + return 0; } if (pmd_large(*pmdp)) { @@ -882,10 +929,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ pgste_val(new) &= ~PGSTE_GR_BIT; @@ -917,15 +961,24 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp; pte_t *ptep; - pmdp = pmd_alloc_map(mm, addr); - if (unlikely(!pmdp)) + /* + * If we don't have a PTE table and if there is no huge page mapped, + * the storage key is 0. + */ + *key = 0; + + switch (pmd_lookup(mm, addr, &pmdp)) { + case -ENOENT: + return 0; + case 0: + break; + default: return -EFAULT; + } ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { - /* Not yet mapped memory has a zero key */ spin_unlock(ptl); - *key = 0; return 0; } @@ -938,10 +991,7 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, } spin_unlock(ptl); - ptep = pte_alloc_map_lock(mm, pmdp, addr, &ptl); - if (unlikely(!ptep)) - return -EFAULT; - + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; @@ -970,6 +1020,7 @@ EXPORT_SYMBOL(get_guest_storage_key); int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, unsigned long *oldpte, unsigned long *oldpgste) { + struct vm_area_struct *vma; unsigned long pgstev; spinlock_t *ptl; pgste_t pgste; @@ -979,6 +1030,10 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, WARN_ON_ONCE(orc > ESSA_MAX); if (unlikely(orc > ESSA_MAX)) return -EINVAL; + + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1071,10 +1126,14 @@ EXPORT_SYMBOL(pgste_perform_essa); int set_pgste_bits(struct mm_struct *mm, unsigned long hva, unsigned long bits, unsigned long value) { + struct vm_area_struct *vma; spinlock_t *ptl; pgste_t new; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; @@ -1099,9 +1158,13 @@ EXPORT_SYMBOL(set_pgste_bits); */ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) { + struct vm_area_struct *vma; spinlock_t *ptl; pte_t *ptep; + vma = vma_lookup(mm, hva); + if (!vma || is_vm_hugetlb_page(vma)) + return -EFAULT; ptep = get_locked_pte(mm, hva, &ptl); if (unlikely(!ptep)) return -EFAULT; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index b403fa14847d..ee1a97078527 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ +#include <linux/memory_hotplug.h> #include <linux/memblock.h> #include <linux/pfn.h> #include <linux/mm.h> @@ -12,8 +12,8 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <asm/cacheflush.h> +#include <asm/nospec-branch.h> #include <asm/pgalloc.h> -#include <asm/pgtable.h> #include <asm/setup.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -21,21 +21,22 @@ static DEFINE_MUTEX(vmem_mutex); -struct memory_segment { - struct list_head list; - unsigned long start; - unsigned long size; -}; - -static LIST_HEAD(mem_segs); - static void __ref *vmem_alloc_pages(unsigned int order) { unsigned long size = PAGE_SIZE << order; if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return (void *) memblock_phys_alloc(size, size); + return memblock_alloc(size, size); +} + +static void vmem_free_pages(unsigned long addr, int order) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + return; + free_pages(addr, order); } void *vmem_crst_alloc(unsigned long val) @@ -56,341 +57,604 @@ pte_t __ref *vmem_pte_alloc(void) if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_phys_alloc(size, size); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); return pte; } +static void vmem_pte_free(unsigned long *table) +{ + /* We don't expect boot memory to be removed ever. */ + if (!slab_is_available() || + WARN_ON_ONCE(PageReserved(virt_to_page(table)))) + return; + page_table_free(&init_mm, table); +} + +#define PAGE_UNUSED 0xFD + /* - * Add a physical memory range to the 1:1 mapping. + * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges + * from unused_sub_pmd_start to next PMD_SIZE boundary. */ -static int vmem_add_mem(unsigned long start, unsigned long size) -{ - unsigned long pgt_prot, sgt_prot, r3_prot; - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - int ret = -ENOMEM; +static unsigned long unused_sub_pmd_start; + +static void vmemmap_flush_unused_sub_pmd(void) +{ + if (!unused_sub_pmd_start) + return; + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, + ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); + unused_sub_pmd_start = 0; +} - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - r3_prot = pgprot_val(REGION3_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - r3_prot &= ~_REGION_ENTRY_NOEXEC; +static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) +{ + /* + * As we expect to add in the same granularity as we remove, it's + * sufficient to mark only some piece used to block the memmap page from + * getting removed (just in case the memmap never gets initialized, + * e.g., because the memory block never gets onlined). + */ + memset((void *)start, 0, sizeof(struct page)); +} + +static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) +{ + /* + * We only optimize if the new used range directly follows the + * previously unused range (esp., when populating consecutive sections). + */ + if (unused_sub_pmd_start == start) { + unused_sub_pmd_start = end; + if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE))) + unused_sub_pmd_start = 0; + return; } - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } - pu_dir = pud_offset(p4_dir, address); - if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address && - !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pud_val(*pu_dir) = address | r3_prot; - address += PUD_SIZE; - pages2g++; - continue; - } - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) - goto out; - pud_populate(&init_mm, pu_dir, pm_dir); - } - pm_dir = pmd_offset(pu_dir, address); - if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address && - !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) && - !debug_pagealloc_enabled()) { - pmd_val(*pm_dir) = address | sgt_prot; - address += PMD_SIZE; - pages1m++; + vmemmap_flush_unused_sub_pmd(); + vmemmap_mark_sub_pmd_used(start, end); +} + +static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + + /* Could be our memmap page is filled with PAGE_UNUSED already ... */ + vmemmap_mark_sub_pmd_used(start, end); + + /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ + if (!IS_ALIGNED(start, PMD_SIZE)) + memset((void *)page, PAGE_UNUSED, start - page); + /* + * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of + * consecutive sections. Remember for the last added PMD the last + * unused range in the populated PMD. + */ + if (!IS_ALIGNED(end, PMD_SIZE)) + unused_sub_pmd_start = end; +} + +/* Returns true if the PMD is completely unused and can be freed. */ +static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) +{ + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + + vmemmap_flush_unused_sub_pmd(); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); +} + +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long prot, pages = 0; + int ret = -ENOMEM; + pte_t *pte; + + prot = pgprot_val(PAGE_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_PAGE_NOEXEC; + + pte = pte_offset_kernel(pmd, addr); + for (; addr < end; addr += PAGE_SIZE, pte++) { + if (!add) { + if (pte_none(*pte)) + continue; + if (!direct) + vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + pte_clear(&init_mm, addr, pte); + } else if (pte_none(*pte)) { + if (!direct) { + void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + + if (!new_page) + goto out; + set_pte(pte, __pte(__pa(new_page) | prot)); + } else { + set_pte(pte, __pte(__pa(addr) | prot)); + } + } else { continue; } - if (pmd_none(*pm_dir)) { - pt_dir = vmem_pte_alloc(); - if (!pt_dir) - goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } - - pt_dir = pte_offset_kernel(pm_dir, address); - pte_val(*pt_dir) = address | pgt_prot; - address += PAGE_SIZE; - pages4k++; + pages++; } ret = 0; out: - update_page_count(PG_DIRECT_MAP_4K, pages4k); - update_page_count(PG_DIRECT_MAP_1M, pages1m); - update_page_count(PG_DIRECT_MAP_2G, pages2g); + if (direct) + update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages); return ret; } -/* - * Remove a physical memory range from the 1:1 mapping. - * Currently only invalidates page table entries. - */ -static void vmem_remove_range(unsigned long start, unsigned long size) +static void try_free_pte_table(pmd_t *pmd, unsigned long start) { - unsigned long pages4k, pages1m, pages2g; - unsigned long end = start + size; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pages4k = pages1m = pages2g = 0; - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - address += PGDIR_SIZE; - continue; - } - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - address += P4D_SIZE; - continue; - } - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - address += PUD_SIZE; - continue; - } - if (pud_large(*pu_dir)) { - pud_clear(pu_dir); - address += PUD_SIZE; - pages2g++; - continue; - } - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - address += PMD_SIZE; - continue; - } - if (pmd_large(*pm_dir)) { - pmd_clear(pm_dir); - address += PMD_SIZE; - pages1m++; - continue; - } - pt_dir = pte_offset_kernel(pm_dir, address); - pte_clear(&init_mm, address, pt_dir); - address += PAGE_SIZE; - pages4k++; + pte_t *pte; + int i; + + /* We can safely assume this is fully in 1:1 mapping & vmemmap area */ + pte = pte_offset_kernel(pmd, start); + for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(*pte)) + return; } - flush_tlb_kernel_range(start, end); - update_page_count(PG_DIRECT_MAP_4K, -pages4k); - update_page_count(PG_DIRECT_MAP_1M, -pages1m); - update_page_count(PG_DIRECT_MAP_2G, -pages2g); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); + pmd_clear(pmd); } -/* - * Add a backed mem_map array to the virtual mem_map array. - */ -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) -{ - unsigned long pgt_prot, sgt_prot; - unsigned long address = start; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; +/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ +static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, + unsigned long end, bool add, bool direct) +{ + unsigned long next, prot, pages = 0; int ret = -ENOMEM; + pmd_t *pmd; + pte_t *pte; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!MACHINE_HAS_NX) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - for (address = start; address < end;) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); - if (!p4_dir) - goto out; - pgd_populate(&init_mm, pg_dir, p4_dir); - } + prot = pgprot_val(SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_SEGMENT_ENTRY_NOEXEC; - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); - if (!pu_dir) - goto out; - p4d_populate(&init_mm, p4_dir, pu_dir); - } + pmd = pmd_offset(pud, addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + if (!add) { + if (pmd_none(*pmd)) + continue; + if (pmd_large(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + pages++; + } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + pmd_clear(pmd); + } + continue; + } + } else if (pmd_none(*pmd)) { + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE) && + MACHINE_HAS_EDAT1 && direct && + !debug_pagealloc_enabled()) { + set_pmd(pmd, __pmd(__pa(addr) | prot)); + pages++; + continue; + } else if (!direct && MACHINE_HAS_EDAT1) { + void *new_page; - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); - if (!pm_dir) + /* + * Use 1MB frames for vmemmap if available. We + * always use large frames even if they are only + * partially used. Otherwise we would have also + * page tables since vmemmap_populate gets + * called for each section separately. + */ + new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + if (new_page) { + set_pmd(pmd, __pmd(__pa(new_page) | prot)); + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + vmemmap_use_new_sub_pmd(addr, next); + } + continue; + } + } + pte = vmem_pte_alloc(); + if (!pte) goto out; - pud_populate(&init_mm, pu_dir, pm_dir); + pmd_populate(&init_mm, pmd, pte); + } else if (pmd_large(*pmd)) { + if (!direct) + vmemmap_use_sub_pmd(addr, next); + continue; } + ret = modify_pte_table(pmd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pte_table(pmd, addr & PMD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages); + return ret; +} - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - /* Use 1MB frames for vmemmap if available. We always - * use large frames even if they are only partially - * used. - * Otherwise we would have also page tables since - * vmemmap_populate gets called for each section - * separately. */ - if (MACHINE_HAS_EDAT1) { - void *new_page; +static void try_free_pmd_table(pud_t *pud, unsigned long start) +{ + const unsigned long end = start + PUD_SIZE; + pmd_t *pmd; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + pmd = pmd_offset(pud, start); + for (i = 0; i < PTRS_PER_PMD; i++, pmd++) + if (!pmd_none(*pmd)) + return; + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + pud_clear(pud); +} - new_page = vmemmap_alloc_block(PMD_SIZE, node); - if (!new_page) - goto out; - pmd_val(*pm_dir) = __pa(new_page) | sgt_prot; - address = (address + PMD_SIZE) & PMD_MASK; +static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next, prot, pages = 0; + int ret = -ENOMEM; + pud_t *pud; + pmd_t *pmd; + + prot = pgprot_val(REGION3_KERNEL); + if (!MACHINE_HAS_NX) + prot &= ~_REGION_ENTRY_NOEXEC; + pud = pud_offset(p4d, addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + if (!add) { + if (pud_none(*pud)) + continue; + if (pud_large(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE)) { + pud_clear(pud); + pages++; + } + continue; + } + } else if (pud_none(*pud)) { + if (IS_ALIGNED(addr, PUD_SIZE) && + IS_ALIGNED(next, PUD_SIZE) && + MACHINE_HAS_EDAT2 && direct && + !debug_pagealloc_enabled()) { + set_pud(pud, __pud(__pa(addr) | prot)); + pages++; continue; } - pt_dir = vmem_pte_alloc(); - if (!pt_dir) + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) goto out; - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; + pud_populate(&init_mm, pud, pmd); + } else if (pud_large(*pud)) { continue; } + ret = modify_pmd_table(pud, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pmd_table(pud, addr & PUD_MASK); + } + ret = 0; +out: + if (direct) + update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages); + return ret; +} - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *new_page; +static void try_free_pud_table(p4d_t *p4d, unsigned long start) +{ + const unsigned long end = start + P4D_SIZE; + pud_t *pud; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + pud = pud_offset(p4d, start); + for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + if (!pud_none(*pud)) + return; + } + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + p4d_clear(p4d); +} - new_page = vmemmap_alloc_block(PAGE_SIZE, node); - if (!new_page) +static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, + bool add, bool direct) +{ + unsigned long next; + int ret = -ENOMEM; + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_offset(pgd, addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + if (!add) { + if (p4d_none(*p4d)) + continue; + } else if (p4d_none(*p4d)) { + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) goto out; - pte_val(*pt_dir) = __pa(new_page) | pgt_prot; + p4d_populate(&init_mm, p4d, pud); } - address += PAGE_SIZE; + ret = modify_pud_table(p4d, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_pud_table(p4d, addr & P4D_MASK); } ret = 0; out: return ret; } -void vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { + const unsigned long end = start + PGDIR_SIZE; + p4d_t *p4d; + int i; + + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (end > VMALLOC_START) + return; +#ifdef CONFIG_KASAN + if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) + return; +#endif + + p4d = p4d_offset(pgd, start); + for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { + if (!p4d_none(*p4d)) + return; + } + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + pgd_clear(pgd); } -/* - * Add memory segment to the segment list if it doesn't overlap with - * an already present segment. - */ -static int insert_memory_segment(struct memory_segment *seg) +static int modify_pagetable(unsigned long start, unsigned long end, bool add, + bool direct) { - struct memory_segment *tmp; + unsigned long addr, next; + int ret = -ENOMEM; + pgd_t *pgd; + p4d_t *p4d; - if (seg->start + seg->size > VMEM_MAX_PHYS || - seg->start + seg->size < seg->start) - return -ERANGE; + if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) + return -EINVAL; + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + pgd = pgd_offset_k(addr); - list_for_each_entry(tmp, &mem_segs, list) { - if (seg->start >= tmp->start + tmp->size) - continue; - if (seg->start + seg->size <= tmp->start) - continue; - return -ENOSPC; + if (!add) { + if (pgd_none(*pgd)) + continue; + } else if (pgd_none(*pgd)) { + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + ret = modify_p4d_table(pgd, addr, next, add, direct); + if (ret) + goto out; + if (!add) + try_free_p4d_table(pgd, addr & PGDIR_MASK); } - list_add(&seg->list, &mem_segs); - return 0; + ret = 0; +out: + if (!add) + flush_tlb_kernel_range(start, end); + return ret; +} + +static int add_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, true, direct); +} + +static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +{ + return modify_pagetable(start, end, false, direct); } /* - * Remove memory segment from the segment list. + * Add a physical memory range to the 1:1 mapping. */ -static void remove_memory_segment(struct memory_segment *seg) +static int vmem_add_range(unsigned long start, unsigned long size) { - list_del(&seg->list); + return add_pagetable(start, start + size, true); } -static void __remove_shared_memory(struct memory_segment *seg) +/* + * Remove a physical memory range from the 1:1 mapping. + */ +static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_memory_segment(seg); - vmem_remove_range(seg->start, seg->size); + remove_pagetable(start, start + size, true); } -int vmem_remove_mapping(unsigned long start, unsigned long size) +/* + * Add a backed mem_map array to the virtual mem_map array. + */ +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { - struct memory_segment *seg; int ret; mutex_lock(&vmem_mutex); + /* We don't care about the node, just use NUMA_NO_NODE on allocations */ + ret = add_pagetable(start, end, false); + if (ret) + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = -ENOENT; - list_for_each_entry(seg, &mem_segs, list) { - if (seg->start == start && seg->size == size) - break; - } - - if (seg->start != start || seg->size != size) - goto out; +void vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + mutex_lock(&vmem_mutex); + remove_pagetable(start, end, false); + mutex_unlock(&vmem_mutex); +} - ret = 0; - __remove_shared_memory(seg); - kfree(seg); -out: +void vmem_remove_mapping(unsigned long start, unsigned long size) +{ + mutex_lock(&vmem_mutex); + vmem_remove_range(start, size); mutex_unlock(&vmem_mutex); - return ret; +} + +struct range arch_get_mappable_range(void) +{ + struct range mhp_range; + + mhp_range.start = 0; + mhp_range.end = VMEM_MAX_PHYS - 1; + return mhp_range; } int vmem_add_mapping(unsigned long start, unsigned long size) { - struct memory_segment *seg; + struct range range = arch_get_mappable_range(); int ret; - mutex_lock(&vmem_mutex); - ret = -ENOMEM; - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - goto out; - seg->start = start; - seg->size = size; + if (start < range.start || + start + size > range.end + 1 || + start + size < start) + return -ERANGE; - ret = insert_memory_segment(seg); + mutex_lock(&vmem_mutex); + ret = vmem_add_range(start, size); if (ret) - goto out_free; + vmem_remove_range(start, size); + mutex_unlock(&vmem_mutex); + return ret; +} - ret = vmem_add_mem(start, size); - if (ret) - goto out_remove; - goto out; +/* + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserverd for 4KB mappings only. + */ +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; -out_remove: - __remove_shared_memory(seg); -out_free: - kfree(seg); + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); mutex_unlock(&vmem_mutex); - return ret; } /* @@ -400,10 +664,11 @@ out: */ void __init vmem_map_init(void) { - struct memblock_region *reg; + phys_addr_t base, end; + u64 i; - for_each_memblock(memory, reg) - vmem_add_mem(reg->base, reg->size); + for_each_mem_range(i, &base, &end) + vmem_add_range(base, end - base); __set_memory((unsigned long)_stext, (unsigned long)(_etext - _stext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); @@ -413,32 +678,16 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, + __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - pr_info("Write protected kernel read-only data: %luk\n", - (unsigned long)(__end_rodata - _stext) >> 10); -} -/* - * Convert memblock.memory to a memory segment list so there is a single - * list that contains all memory segments. - */ -static int __init vmem_convert_memory_chunk(void) -{ - struct memblock_region *reg; - struct memory_segment *seg; + /* lowcore requires 4k mapping for real addresses / prefixing */ + set_memory_4k(0, LC_PAGES); - mutex_lock(&vmem_mutex); - for_each_memblock(memory, reg) { - seg = kzalloc(sizeof(*seg), GFP_KERNEL); - if (!seg) - panic("Out of memory...\n"); - seg->start = reg->base; - seg->size = reg->size; - insert_memory_segment(seg); - } - mutex_unlock(&vmem_mutex); - return 0; -} + /* lowcore must be executable for LPSWE */ + if (!static_key_enabled(&cpu_has_bear)) + set_memory_x(0, 1); -core_initcall(vmem_convert_memory_chunk); + pr_info("Write protected kernel read-only data: %luk\n", + (unsigned long)(__end_rodata - _stext) >> 10); +} |