Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile           |   5
-rw-r--r--  arch/s390/mm/cmm.c              |  15
-rw-r--r--  arch/s390/mm/dump_pagetables.c  |  59
-rw-r--r--  arch/s390/mm/extable.c          |  27
-rw-r--r--  arch/s390/mm/extmem.c           |  25
-rw-r--r--  arch/s390/mm/fault.c            | 746
-rw-r--r--  arch/s390/mm/gmap.c             | 472
-rw-r--r--  arch/s390/mm/hugetlbpage.c      |  26
-rw-r--r--  arch/s390/mm/init.c             | 112
-rw-r--r--  arch/s390/mm/kasan_init.c       | 403
-rw-r--r--  arch/s390/mm/maccess.c          | 218
-rw-r--r--  arch/s390/mm/mmap.c             |  43
-rw-r--r--  arch/s390/mm/page-states.c      | 211
-rw-r--r--  arch/s390/mm/pageattr.c         | 107
-rw-r--r--  arch/s390/mm/pfault.c           | 248
-rw-r--r--  arch/s390/mm/pgalloc.c          | 331
-rw-r--r--  arch/s390/mm/pgtable.c          |  80
-rw-r--r--  arch/s390/mm/physaddr.c         |  15
-rw-r--r--  arch/s390/mm/vmem.c             | 259
19 files changed, 1477 insertions(+), 1925 deletions(-)
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index 57e4f3a24829..f6c2db7a8669 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -7,9 +7,8 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o - -KASAN_SANITIZE_kasan_init.o := n -obj-$(CONFIG_KASAN) += kasan_init.o +obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 9141ed4c52e9..f8b13f247646 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(virt_to_pfn(addr), 1); + diag10_range(virt_to_pfn((void *)addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); @@ -332,17 +332,6 @@ static struct ctl_table cmm_table[] = { .mode = 0644, .proc_handler = cmm_timeout_handler, }, - { } -}; - -static struct ctl_table cmm_dir_table[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = cmm_table, - }, - { } }; #ifdef CONFIG_CMM_IUCV @@ -389,7 +378,7 @@ static int __init cmm_init(void) { int rc = -ENOMEM; - cmm_sysctl_header = register_sysctl_table(cmm_dir_table); + cmm_sysctl_header = register_sysctl("vm", cmm_table); if (!cmm_sysctl_header) goto out_sysctl; #ifdef CONFIG_CMM_IUCV diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9f9af5298dd6..ffd07ed7b4af 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -6,10 +6,11 @@ #include <linux/mm.h> #include <linux/kfence.h> #include <linux/kasan.h> -#include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> #include <asm/nospec-branch.h> #include <asm/sections.h> +#include <asm/maccess.h> static unsigned long max_addr; @@ -21,6 +22,8 @@ struct addr_marker { enum address_markers_idx { IDENTITY_BEFORE_NR = 0, IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, #ifdef CONFIG_KFENCE @@ -29,21 +32,27 @@ enum address_markers_idx { #endif IDENTITY_AFTER_NR, IDENTITY_AFTER_END_NR, -#ifdef CONFIG_KASAN - KASAN_SHADOW_START_NR, - KASAN_SHADOW_END_NR, -#endif VMEMMAP_NR, VMEMMAP_END_NR, VMALLOC_NR, VMALLOC_END_NR, MODULES_NR, MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, +#ifdef CONFIG_KASAN + KASAN_SHADOW_START_NR, + KASAN_SHADOW_END_NR, +#endif }; static struct addr_marker address_markers[] = { [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, #ifdef CONFIG_KFENCE @@ -52,16 +61,20 @@ static struct addr_marker address_markers[] = { #endif [IDENTITY_AFTER_NR] = {(unsigned long)_end, "Identity Mapping Start"}, [IDENTITY_AFTER_END_NR] = {0, "Identity Mapping End"}, -#ifdef CONFIG_KASAN - [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, - [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, -#endif [VMEMMAP_NR] = {0, "vmemmap Area Start"}, [VMEMMAP_END_NR] = {0, "vmemmap Area End"}, [VMALLOC_NR] = {0, "vmalloc Area Start"}, 
[VMALLOC_END_NR] = {0, "vmalloc Area End"}, [MODULES_NR] = {0, "Modules Area Start"}, [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, +#ifdef CONFIG_KASAN + [KASAN_SHADOW_START_NR] = {KASAN_SHADOW_START, "Kasan Shadow Start"}, + [KASAN_SHADOW_END_NR] = {KASAN_SHADOW_END, "Kasan Shadow End"}, +#endif { -1, NULL } }; @@ -108,7 +121,6 @@ static void print_prot(struct seq_file *m, unsigned int pr, int level) static void note_prot_wx(struct pg_state *st, unsigned long addr) { -#ifdef CONFIG_DEBUG_WX if (!st->check_wx) return; if (st->current_prot & _PAGE_INVALID) @@ -125,10 +137,10 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) */ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear))) return; - WARN_ONCE(1, "s390/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX), + "s390/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); st->wx_pages += (addr - st->start_address) / PAGE_SIZE; -#endif /* CONFIG_DEBUG_WX */ } static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) @@ -180,8 +192,7 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, } } -#ifdef CONFIG_DEBUG_WX -void ptdump_check_wx(void) +bool ptdump_check_wx(void) { struct pg_state st = { .ptdump = { @@ -204,16 +215,20 @@ void ptdump_check_wx(void) }; if (!MACHINE_HAS_NX) - return; + return true; ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); - if (st.wx_pages) + if (st.wx_pages) { pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", st.wx_pages); - else + + return false; + } else { pr_info("Checked W+X mappings: passed, no %sW+X pages found\n", (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ? "unexpected " : ""); + + return true; + } } -#endif /* CONFIG_DEBUG_WX */ #ifdef CONFIG_PTDUMP_DEBUGFS static int ptdump_show(struct seq_file *m, void *v) @@ -273,11 +288,17 @@ static int pt_dump_init(void) * kernel ASCE. We need this to keep the page table walker functions * from accessing non-existent entries. 
*/ - max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; + max_addr = (S390_lowcore.kernel_asce.val & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = (unsigned long)__samode31; + address_markers[AMODE31_END_NR].start_address = (unsigned long)__eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c index 1e4d2187541a..0a0738a473af 100644 --- a/arch/s390/mm/extable.c +++ b/arch/s390/mm/extable.c @@ -47,13 +47,32 @@ static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struc return true; } -static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, + bool pair, struct pt_regs *regs) { unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); regs->gprs[reg_err] = -EFAULT; regs->gprs[reg_zero] = 0; + if (pair) + regs->gprs[reg_zero + 1] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_data = FIELD_GET(EX_DATA_REG_ERR, ex->data); + unsigned long data, addr, offset; + + addr = regs->gprs[reg_addr]; + offset = addr & (sizeof(unsigned long) - 1); + addr &= ~(sizeof(unsigned long) - 1); + data = *(unsigned long *)addr; + data <<= BITS_PER_BYTE * offset; + regs->gprs[reg_data] = data; regs->psw.addr = extable_fixup(ex); return true; } @@ -75,7 +94,11 @@ bool fixup_exception(struct pt_regs *regs) case EX_TYPE_UA_LOAD_MEM: return ex_handler_ua_load_mem(ex, regs); case EX_TYPE_UA_LOAD_REG: - return ex_handler_ua_load_reg(ex, regs); + return ex_handler_ua_load_reg(ex, false, regs); + case EX_TYPE_UA_LOAD_REGPAIR: + return ex_handler_ua_load_reg(ex, true, regs); + case EX_TYPE_ZEROPAD: + return ex_handler_zeropad(ex, regs); } panic("invalid exception table entry"); } diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c index 5060956b8e7d..282fefe107a2 100644 --- a/arch/s390/mm/extmem.c +++ b/arch/s390/mm/extmem.c @@ -136,7 +136,7 @@ dcss_diag(int *func, void *parameter, unsigned long rx, ry; int rc; - rx = (unsigned long) parameter; + rx = virt_to_phys(parameter); ry = (unsigned long) *func; diag_stat_inc(DIAG_STAT_X064); @@ -178,7 +178,7 @@ query_segment_type (struct dcss_segment *seg) /* initialize diag input parameters */ qin->qopcode = DCSS_FINDSEGA; - qin->qoutptr = (unsigned long) qout; + qin->qoutptr = virt_to_phys(qout); qin->qoutlen = sizeof(struct qout64); memcpy (qin->qname, seg->dcss_name, 8); @@ -289,15 +289,17 @@ segment_overlaps_others (struct dcss_segment *seg) /* * real 
segment loading function, called from segment_load + * Must return either an error code < 0, or the segment type code >= 0 */ static int __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long *end) { unsigned long start_addr, end_addr, dummy; struct dcss_segment *seg; - int rc, diag_cc; + int rc, diag_cc, segtype; start_addr = end_addr = 0; + segtype = -1; seg = kmalloc(sizeof(*seg), GFP_KERNEL | GFP_DMA); if (seg == NULL) { rc = -ENOMEM; @@ -326,9 +328,9 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long seg->res_name[8] = '\0'; strlcat(seg->res_name, " (DCSS)", sizeof(seg->res_name)); seg->res->name = seg->res_name; - rc = seg->vm_segtype; - if (rc == SEG_TYPE_SC || - ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared)) + segtype = seg->vm_segtype; + if (segtype == SEG_TYPE_SC || + ((segtype == SEG_TYPE_SR || segtype == SEG_TYPE_ER) && !do_nonshared)) seg->res->flags |= IORESOURCE_READONLY; /* Check for overlapping resources before adding the mapping. */ @@ -386,7 +388,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long out_free: kfree(seg); out: - return rc; + return rc < 0 ? rc : segtype; } /* @@ -638,10 +640,13 @@ void segment_warning(int rc, char *seg_name) pr_err("There is not enough memory to load or query " "DCSS %s\n", seg_name); break; - case -ERANGE: - pr_err("DCSS %s exceeds the kernel mapping range (%lu) " - "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS); + case -ERANGE: { + struct range mhp_range = arch_get_mappable_range(); + + pr_err("DCSS %s exceeds the kernel mapping range (%llu) " + "and cannot be loaded\n", seg_name, mhp_range.end + 1); break; + } default: break; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e173b6187ad5..c421dd44ffbe 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -3,17 +3,19 @@ * S390 version * Copyright IBM Corp. 
1999 * Author(s): Hartmut Penner (hp@de.ibm.com) - * Ulrich Weigand (uweigand@de.ibm.com) + * Ulrich Weigand (uweigand@de.ibm.com) * * Derived from "arch/i386/mm/fault.c" * Copyright (C) 1995 Linus Torvalds */ #include <linux/kernel_stat.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/debug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> @@ -34,36 +36,27 @@ #include <linux/kfence.h> #include <asm/asm-extable.h> #include <asm/asm-offsets.h> +#include <asm/ptrace.h> +#include <asm/fault.h> #include <asm/diag.h> #include <asm/gmap.h> #include <asm/irq.h> -#include <asm/mmu_context.h> #include <asm/facility.h> #include <asm/uv.h> #include "../kernel/entry.h" -#define __FAIL_ADDR_MASK -4096L -#define __SUBCODE_MASK 0x0600 -#define __PF_RES_FIELD 0x8000000000000000ULL - -#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000) -#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000) -#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000) -#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000) -#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000) - enum fault_type { KERNEL_FAULT, USER_FAULT, GMAP_FAULT, }; -static unsigned long store_indication __read_mostly; +static DEFINE_STATIC_KEY_FALSE(have_store_indication); static int __init fault_init(void) { if (test_facility(75)) - store_indication = 0xc00; + static_branch_enable(&have_store_indication); return 0; } early_initcall(fault_init); @@ -73,82 +66,90 @@ early_initcall(fault_init); */ static enum fault_type get_fault_type(struct pt_regs *regs) { - unsigned long trans_exc_code; + union teid teid = { .val = regs->int_parm_long }; + struct gmap *gmap; - trans_exc_code = regs->int_parm_long & 3; - if (likely(trans_exc_code == 0)) { - /* primary space exception */ + if (likely(teid.as == PSW_BITS_AS_PRIMARY)) { if (user_mode(regs)) return USER_FAULT; if (!IS_ENABLED(CONFIG_PGSTE)) return KERNEL_FAULT; - if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + gmap = (struct gmap *)S390_lowcore.gmap; + if (regs->cr1 == gmap->asce) return GMAP_FAULT; return KERNEL_FAULT; } - if (trans_exc_code == 2) + if (teid.as == PSW_BITS_AS_SECONDARY) return USER_FAULT; - if (trans_exc_code == 1) { - /* access register mode, not used in the kernel */ + /* Access register mode, not used in the kernel */ + if (teid.as == PSW_BITS_AS_ACCREG) return USER_FAULT; - } - /* home space exception -> access via kernel ASCE */ + /* Home space -> access via kernel ASCE */ return KERNEL_FAULT; } -static int bad_address(void *p) +static unsigned long get_fault_address(struct pt_regs *regs) { - unsigned long dummy; + union teid teid = { .val = regs->int_parm_long }; - return get_kernel_nofault(dummy, (unsigned long *)p); + return teid.addr * PAGE_SIZE; +} + +static __always_inline bool fault_is_write(struct pt_regs *regs) +{ + union teid teid = { .val = regs->int_parm_long }; + + if (static_branch_likely(&have_store_indication)) + return teid.fsi == TEID_FSI_STORE; + return false; } static void dump_pagetable(unsigned long asce, unsigned long address) { - unsigned long *table = __va(asce & _ASCE_ORIGIN); + unsigned long entry, *table = __va(asce & _ASCE_ORIGIN); pr_alert("AS:%016lx ", asce); switch (asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: table += (address & _REGION1_INDEX) >> _REGION1_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R1:%016lx ", *table); - if (*table & 
_REGION_ENTRY_INVALID) + pr_cont("R1:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R2:%016lx ", *table); - if (*table & _REGION_ENTRY_INVALID) + pr_cont("R2:%016lx ", entry); + if (entry & _REGION_ENTRY_INVALID) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("R3:%016lx ", *table); - if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) + pr_cont("R3:%016lx ", entry); + if (entry & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = __va(*table & _REGION_ENTRY_ORIGIN); + table = __va(entry & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("S:%016lx ", *table); - if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) + pr_cont("S:%016lx ", entry); + if (entry & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = __va(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(entry & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; - if (bad_address(table)) + if (get_kernel_nofault(entry, table)) goto bad; - pr_cont("P:%016lx ", *table); + pr_cont("P:%016lx ", entry); out: pr_cont("\n"); return; @@ -158,163 +159,113 @@ bad: static void dump_fault_info(struct pt_regs *regs) { + union teid teid = { .val = regs->int_parm_long }; unsigned long asce; pr_alert("Failing address: %016lx TEID: %016lx\n", - regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long); + get_fault_address(regs), teid.val); pr_alert("Fault in "); - switch (regs->int_parm_long & 3) { - case 3: + switch (teid.as) { + case PSW_BITS_AS_HOME: pr_cont("home space "); break; - case 2: + case PSW_BITS_AS_SECONDARY: pr_cont("secondary space "); break; - case 1: + case PSW_BITS_AS_ACCREG: pr_cont("access register "); break; - case 0: + case PSW_BITS_AS_PRIMARY: pr_cont("primary space "); break; } pr_cont("mode while using "); switch (get_fault_type(regs)) { case USER_FAULT: - asce = S390_lowcore.user_asce; + asce = S390_lowcore.user_asce.val; pr_cont("user "); break; case GMAP_FAULT: - asce = ((struct gmap *) S390_lowcore.gmap)->asce; + asce = ((struct gmap *)S390_lowcore.gmap)->asce; pr_cont("gmap "); break; case KERNEL_FAULT: - asce = S390_lowcore.kernel_asce; + asce = S390_lowcore.kernel_asce.val; pr_cont("kernel "); break; default: unreachable(); } pr_cont("ASCE.\n"); - dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); + dump_pagetable(asce, get_fault_address(regs)); } int show_unhandled_signals = 1; void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault) { + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + if ((task_pid_nr(current) > 1) && !show_unhandled_signals) return; if (!unhandled_signal(current, signr)) return; - if (!printk_ratelimit()) + if (!__ratelimit(&rs)) return; - printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ", - regs->int_code & 0xffff, regs->int_code >> 17); + pr_alert("User process 
fault: interruption code %04x ilc:%d ", + regs->int_code & 0xffff, regs->int_code >> 17); print_vma_addr(KERN_CONT "in ", regs->psw.addr); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (is_mm_fault) dump_fault_info(regs); show_regs(regs); } -/* - * Send SIGSEGV to task. This is an external routine - * to keep the stack usage of do_page_fault small. - */ -static noinline void do_sigsegv(struct pt_regs *regs, int si_code) +static void do_sigsegv(struct pt_regs *regs, int si_code) { report_user_fault(regs, SIGSEGV, 1); - force_sig_fault(SIGSEGV, si_code, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + force_sig_fault(SIGSEGV, si_code, (void __user *)get_fault_address(regs)); } -static noinline void do_no_context(struct pt_regs *regs) +static void handle_fault_error_nolock(struct pt_regs *regs, int si_code) { + enum fault_type fault_type; + unsigned long address; + bool is_write; + + if (user_mode(regs)) { + if (WARN_ON_ONCE(!si_code)) + si_code = SEGV_MAPERR; + return do_sigsegv(regs, si_code); + } if (fixup_exception(regs)) return; - /* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - if (get_fault_type(regs) == KERNEL_FAULT) - printk(KERN_ALERT "Unable to handle kernel pointer dereference" - " in virtual kernel address space\n"); + fault_type = get_fault_type(regs); + if (fault_type == KERNEL_FAULT) { + address = get_fault_address(regs); + is_write = fault_is_write(regs); + if (kfence_handle_page_fault(address, is_write, regs)) + return; + } + if (fault_type == KERNEL_FAULT) + pr_alert("Unable to handle kernel pointer dereference in virtual kernel address space\n"); else - printk(KERN_ALERT "Unable to handle kernel paging request" - " in virtual user address space\n"); + pr_alert("Unable to handle kernel paging request in virtual user address space\n"); dump_fault_info(regs); die(regs, "Oops"); } -static noinline void do_low_address(struct pt_regs *regs) +static void handle_fault_error(struct pt_regs *regs, int si_code) { - /* Low-address protection hit in kernel mode means - NULL pointer write access in kernel mode. */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - /* Low-address protection hit in user mode 'cannot happen'. */ - die (regs, "Low-address protection"); - } + struct mm_struct *mm = current->mm; - do_no_context(regs); -} - -static noinline void do_sigbus(struct pt_regs *regs) -{ - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - force_sig_fault(SIGBUS, BUS_ADRERR, - (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); + mmap_read_unlock(mm); + handle_fault_error_nolock(regs, si_code); } -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) +static void do_sigbus(struct pt_regs *regs) { - int si_code; - - switch (fault) { - case VM_FAULT_BADACCESS: - case VM_FAULT_BADMAP: - /* Bad memory access. Check if it is kernel or user space. */ - if (user_mode(regs)) { - /* User mode accesses just cause a SIGSEGV */ - si_code = (fault == VM_FAULT_BADMAP) ? - SEGV_MAPERR : SEGV_ACCERR; - do_sigsegv(regs, si_code); - break; - } - fallthrough; - case VM_FAULT_BADCONTEXT: - case VM_FAULT_PFAULT: - do_no_context(regs); - break; - case VM_FAULT_SIGNAL: - if (!user_mode(regs)) - do_no_context(regs); - break; - default: /* fault & VM_FAULT_ERROR */ - if (fault & VM_FAULT_OOM) { - if (!user_mode(regs)) - do_no_context(regs); - else - pagefault_out_of_memory(); - } else if (fault & VM_FAULT_SIGSEGV) { - /* Kernel mode? 
Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & VM_FAULT_SIGBUS) { - /* Kernel mode? Handle exceptions or die */ - if (!user_mode(regs)) - do_no_context(regs); - else - do_sigbus(regs); - } else - BUG(); - break; - } + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)get_fault_address(regs)); } /* @@ -323,161 +274,178 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, * routines. * * interruption code (int_code): - * 04 Protection -> Write-Protection (suppression) - * 10 Segment translation -> Not present (nullification) - * 11 Page translation -> Not present (nullification) - * 3b Region third trans. -> Not present (nullification) + * 04 Protection -> Write-Protection (suppression) + * 10 Segment translation -> Not present (nullification) + * 11 Page translation -> Not present (nullification) + * 3b Region third trans. -> Not present (nullification) */ -static inline vm_fault_t do_exception(struct pt_regs *regs, int access) +static void do_exception(struct pt_regs *regs, int access) { - struct gmap *gmap; - struct task_struct *tsk; - struct mm_struct *mm; struct vm_area_struct *vma; - enum fault_type type; - unsigned long trans_exc_code; unsigned long address; + struct mm_struct *mm; + enum fault_type type; unsigned int flags; + struct gmap *gmap; vm_fault_t fault; bool is_write; - tsk = current; /* * The instruction that caused the program check has * been nullified. Don't signal single step via SIGTRAP. */ clear_thread_flag(TIF_PER_TRAP); - if (kprobe_page_fault(regs, 14)) - return 0; - - mm = tsk->mm; - trans_exc_code = regs->int_parm_long; - address = trans_exc_code & __FAIL_ADDR_MASK; - is_write = (trans_exc_code & store_indication) == 0x400; - - /* - * Verify that the fault happened in user space, that - * we are not in an interrupt and that there is a - * user context. 
- */ - fault = VM_FAULT_BADCONTEXT; + return; + mm = current->mm; + address = get_fault_address(regs); + is_write = fault_is_write(regs); type = get_fault_type(regs); switch (type) { case KERNEL_FAULT: - if (kfence_handle_page_fault(address, is_write, regs)) - return 0; - goto out; + return handle_fault_error_nolock(regs, 0); case USER_FAULT: case GMAP_FAULT: if (faulthandler_disabled() || !mm) - goto out; + return handle_fault_error_nolock(regs, 0); break; } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || is_write) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; - mmap_read_lock(mm); + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + if (!(vma->vm_flags & access)) { + vma_end_read(vma); + goto lock_mmap; + } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + if (unlikely(fault & VM_FAULT_ERROR)) + goto error; + return; + } + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } +lock_mmap: + mmap_read_lock(mm); gmap = NULL; if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { - gmap = (struct gmap *) S390_lowcore.gmap; + gmap = (struct gmap *)S390_lowcore.gmap; current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0xffff; address = __gmap_translate(gmap, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } - retry: - fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto out_up; - + return handle_fault_error(regs, SEGV_MAPERR); if (unlikely(vma->vm_start > address)) { if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out_up; - if (expand_stack(vma, address)) - goto out_up; + return handle_fault_error(regs, SEGV_MAPERR); + vma = expand_stack(mm, address); + if (!vma) + return handle_fault_error_nolock(regs, SEGV_MAPERR); } - - /* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ - fault = VM_FAULT_BADACCESS; if (unlikely(!(vma->vm_flags & access))) - goto out_up; - - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. 
- */ + return handle_fault_error(regs, SEGV_ACCERR); fault = handle_mm_fault(vma, address, flags, regs); if (fault_signal_pending(fault, regs)) { - fault = VM_FAULT_SIGNAL; if (flags & FAULT_FLAG_RETRY_NOWAIT) - goto out_up; - goto out; + mmap_read_unlock(mm); + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + return; + } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto gmap; + } + return; + } + if (unlikely(fault & VM_FAULT_ERROR)) { + mmap_read_unlock(mm); + goto error; } - if (unlikely(fault & VM_FAULT_ERROR)) - goto out_up; - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { /* - * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has - * not been released + * FAULT_FLAG_RETRY_NOWAIT has been set, + * mmap_lock has not been released */ current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; + return handle_fault_error(regs, 0); } flags &= ~FAULT_FLAG_RETRY_NOWAIT; flags |= FAULT_FLAG_TRIED; mmap_read_lock(mm); goto retry; } +gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); - if (address == -EFAULT) { - fault = VM_FAULT_BADMAP; - goto out_up; - } + if (address == -EFAULT) + return handle_fault_error(regs, SEGV_MAPERR); if (address == -ENOMEM) { fault = VM_FAULT_OOM; - goto out_up; + mmap_read_unlock(mm); + goto error; } } - fault = 0; -out_up: mmap_read_unlock(mm); -out: - return fault; + return; +error: + if (fault & VM_FAULT_OOM) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigsegv(regs, SEGV_MAPERR); + } else if (fault & VM_FAULT_SIGBUS) { + if (!user_mode(regs)) + handle_fault_error_nolock(regs, 0); + else + do_sigbus(regs); + } else { + BUG(); + } } void do_protection_exception(struct pt_regs *regs) { - unsigned long trans_exc_code; - int access; - vm_fault_t fault; + union teid teid = { .val = regs->int_parm_long }; - trans_exc_code = regs->int_parm_long; /* * Protection exceptions are suppressing, decrement psw address. * The exception to this rule are aborted transactions, for these @@ -490,281 +458,50 @@ void do_protection_exception(struct pt_regs *regs) * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ - if (unlikely(!(trans_exc_code & 4))) { - do_low_address(regs); - return; + if (unlikely(!teid.b61)) { + if (user_mode(regs)) { + /* Low-address protection in user mode: cannot happen */ + die(regs, "Low-address protection"); + } + /* + * Low-address protection in kernel mode means + * NULL pointer write access in kernel mode. 
+ */ + return handle_fault_error_nolock(regs, 0); } - if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) { - regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) | - (regs->psw.addr & PAGE_MASK); - access = VM_EXEC; - fault = VM_FAULT_BADACCESS; - } else { - access = VM_WRITE; - fault = do_exception(regs, access); + if (unlikely(MACHINE_HAS_NX && teid.b56)) { + regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK); + return handle_fault_error_nolock(regs, SEGV_ACCERR); } - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_WRITE); } NOKPROBE_SYMBOL(do_protection_exception); void do_dat_exception(struct pt_regs *regs) { - int access; - vm_fault_t fault; - - access = VM_ACCESS_FLAGS; - fault = do_exception(regs, access); - if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_exception(regs, VM_ACCESS_FLAGS); } NOKPROBE_SYMBOL(do_dat_exception); -#ifdef CONFIG_PFAULT -/* - * 'pfault' pseudo page faults routines. - */ -static int pfault_disable; - -static int __init nopfault(char *str) -{ - pfault_disable = 1; - return 1; -} - -__setup("nopfault", nopfault); - -struct pfault_refbk { - u16 refdiagc; - u16 reffcode; - u16 refdwlen; - u16 refversn; - u64 refgaddr; - u64 refselmk; - u64 refcmpmk; - u64 reserved; -} __attribute__ ((packed, aligned(8))); - -static struct pfault_refbk pfault_init_refbk = { - .refdiagc = 0x258, - .reffcode = 0, - .refdwlen = 5, - .refversn = 2, - .refgaddr = __LC_LPP, - .refselmk = 1ULL << 48, - .refcmpmk = 1ULL << 48, - .reserved = __PF_RES_FIELD -}; - -int pfault_init(void) -{ - int rc; - - if (pfault_disable) - return -1; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %1,%0,0x258\n" - "0: j 2f\n" - "1: la %0,8\n" - "2:\n" - EX_TABLE(0b,1b) - : "=d" (rc) - : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc"); - return rc; -} - -static struct pfault_refbk pfault_fini_refbk = { - .refdiagc = 0x258, - .reffcode = 1, - .refdwlen = 5, - .refversn = 2, -}; - -void pfault_fini(void) -{ - - if (pfault_disable) - return; - diag_stat_inc(DIAG_STAT_X258); - asm volatile( - " diag %0,0,0x258\n" - "0: nopr %%r7\n" - EX_TABLE(0b,0b) - : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc"); -} - -static DEFINE_SPINLOCK(pfault_lock); -static LIST_HEAD(pfault_list); - -#define PF_COMPLETE 0x0080 - -/* - * The mechanism of our pfault code: if Linux is running as guest, runs a user - * space process and the user space process accesses a page that the host has - * paged out we get a pfault interrupt. - * - * This allows us, within the guest, to schedule a different process. Without - * this mechanism the host would have to suspend the whole virtual cpu until - * the page has been paged in. - * - * So when we get such an interrupt then we set the state of the current task - * to uninterruptible and also set the need_resched flag. Both happens within - * interrupt context(!). If we later on want to return to user space we - * recognize the need_resched flag and then call schedule(). It's not very - * obvious how this works... - * - * Of course we have a lot of additional fun with the completion interrupt (-> - * host signals that a page of a process has been paged in and the process can - * continue to run). This interrupt can arrive on any cpu and, since we have - * virtual cpus, actually appear before the interrupt that signals that a page - * is missing. 
- */ -static void pfault_interrupt(struct ext_code ext_code, - unsigned int param32, unsigned long param64) -{ - struct task_struct *tsk; - __u16 subcode; - pid_t pid; - - /* - * Get the external interruption subcode & pfault initial/completion - * signal bit. VM stores this in the 'cpu address' field associated - * with the external interrupt. - */ - subcode = ext_code.subcode; - if ((subcode & 0xff00) != __SUBCODE_MASK) - return; - inc_irq_stat(IRQEXT_PFL); - /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PID_MASK; - rcu_read_lock(); - tsk = find_task_by_pid_ns(pid, &init_pid_ns); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); - if (!tsk) - return; - spin_lock(&pfault_lock); - if (subcode & PF_COMPLETE) { - /* signal bit is set -> a page has been swapped in by VM */ - if (tsk->thread.pfault_wait == 1) { - /* Initial interrupt was faster than the completion - * interrupt. pfault_wait is valid. Set pfault_wait - * back to zero and wake up the process. This can - * safely be done because the task is still sleeping - * and can't produce new pfaults. */ - tsk->thread.pfault_wait = 0; - list_del(&tsk->thread.list); - wake_up_process(tsk); - put_task_struct(tsk); - } else { - /* Completion interrupt was faster than initial - * interrupt. Set pfault_wait to -1 so the initial - * interrupt doesn't put the task to sleep. - * If the task is not running, ignore the completion - * interrupt since it must be a leftover of a PFAULT - * CANCEL operation which didn't remove all pending - * completion interrupts. */ - if (task_is_running(tsk)) - tsk->thread.pfault_wait = -1; - } - } else { - /* signal bit not set -> a real page is missing. */ - if (WARN_ON_ONCE(tsk != current)) - goto out; - if (tsk->thread.pfault_wait == 1) { - /* Already on the list with a reference: put to sleep */ - goto block; - } else if (tsk->thread.pfault_wait == -1) { - /* Completion interrupt was faster than the initial - * interrupt (pfault_wait == -1). Set pfault_wait - * back to zero and exit. */ - tsk->thread.pfault_wait = 0; - } else { - /* Initial interrupt arrived before completion - * interrupt. Let the task sleep. - * An extra task reference is needed since a different - * cpu may set the task state to TASK_RUNNING again - * before the scheduler is reached. */ - get_task_struct(tsk); - tsk->thread.pfault_wait = 1; - list_add(&tsk->thread.list, &pfault_list); -block: - /* Since this must be a userspace fault, there - * is no kernel task state to trample. Rely on the - * return to userspace schedule() to block. */ - __set_current_state(TASK_UNINTERRUPTIBLE); - set_tsk_need_resched(tsk); - set_preempt_need_resched(); - } - } -out: - spin_unlock(&pfault_lock); - put_task_struct(tsk); -} - -static int pfault_cpu_dead(unsigned int cpu) -{ - struct thread_struct *thread, *next; - struct task_struct *tsk; - - spin_lock_irq(&pfault_lock); - list_for_each_entry_safe(thread, next, &pfault_list, list) { - thread->pfault_wait = 0; - list_del(&thread->list); - tsk = container_of(thread, struct task_struct, thread); - wake_up_process(tsk); - put_task_struct(tsk); - } - spin_unlock_irq(&pfault_lock); - return 0; -} - -static int __init pfault_irq_init(void) -{ - int rc; - - rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); - if (rc) - goto out_extint; - rc = pfault_init() == 0 ? 
0 : -EOPNOTSUPP; - if (rc) - goto out_pfault; - irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); - cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", - NULL, pfault_cpu_dead); - return 0; - -out_pfault: - unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); -out_extint: - pfault_disable = 1; - return rc; -} -early_initcall(pfault_irq_init); - -#endif /* CONFIG_PFAULT */ - #if IS_ENABLED(CONFIG_PGSTE) void do_secure_storage_access(struct pt_regs *regs) { - unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + union teid teid = { .val = regs->int_parm_long }; + unsigned long addr = get_fault_address(regs); struct vm_area_struct *vma; struct mm_struct *mm; struct page *page; + struct gmap *gmap; int rc; /* - * bit 61 tells us if the address is valid, if it's not we - * have a major problem and should stop the kernel or send a - * SIGSEGV to the process. Unfortunately bit 61 is not - * reliable without the misc UV feature so we need to check - * for that as well. + * Bit 61 indicates if the address is valid, if it is not the + * kernel should be stopped or SIGSEGV should be sent to the + * process. Bit 61 is not reliable without the misc UV feature, + * therefore this needs to be checked too. */ - if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && - !test_bit_inv(61, ®s->int_parm_long)) { + if (uv_has_feature(BIT_UV_FEAT_MISC) && !teid.b61) { /* * When this happens, userspace did something that it * was not supposed to do, e.g. branching into secure @@ -774,24 +511,28 @@ void do_secure_storage_access(struct pt_regs *regs) send_sig(SIGSEGV, current, 0); return; } - /* - * The kernel should never run into this case and we - * have no way out of this situation. + * The kernel should never run into this case and + * there is no way out of this situation. 
*/ panic("Unexpected PGM 0x3d with TEID bit 61=0"); } - switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); + fallthrough; case USER_FAULT: mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); - if (!vma) { - mmap_read_unlock(mm); - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - break; - } + if (!vma) + return handle_fault_error(regs, SEGV_MAPERR); page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); if (IS_ERR_OR_NULL(page)) { mmap_read_unlock(mm); @@ -811,25 +552,19 @@ void do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); break; - case GMAP_FAULT: default: - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); + unreachable(); } } NOKPROBE_SYMBOL(do_secure_storage_access); void do_non_secure_storage_access(struct pt_regs *regs) { - unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); - if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); - WARN_ON_ONCE(1); - return; - } - + if (WARN_ON_ONCE(get_fault_type(regs) != GMAP_FAULT)) + return handle_fault_error_nolock(regs, SEGV_MAPERR); if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) send_sig(SIGSEGV, current, 0); } @@ -837,6 +572,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access); void do_secure_storage_violation(struct pt_regs *regs) { + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + unsigned long gaddr = get_fault_address(regs); + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; /* * Either KVM messed up the secure guest mapping or the same * page is mapped into multiple secure guests. @@ -844,9 +589,8 @@ void do_secure_storage_violation(struct pt_regs *regs) * This exception is only triggered when a guest 2 is running * and can therefore never occur in kernel context. 
*/ - printk_ratelimited(KERN_WARNING - "Secure storage violation in task: %s, pid %d\n", - current->comm, current->pid); + pr_warn_ratelimited("Secure storage violation in task: %s, pid %d\n", + current->comm, current->pid); send_sig(SIGSEGV, current, 0); } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b8ae4a4aa2ba..094b43b121cd 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -18,13 +18,25 @@ #include <linux/ksm.h> #include <linux/mman.h> #include <linux/pgtable.h> - +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> +#include <asm/page.h> #include <asm/tlb.h> #define GMAP_SHADOW_FAKE_TABLE 1ULL +static struct page *gmap_alloc_crst(void) +{ + struct page *page; + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return NULL; + __arch_set_page_dat(page_to_virt(page), 1UL << CRST_ALLOC_ORDER); + return page; +} + /** * gmap_alloc - allocate and initialize a guest address space * @limit: maximum address of the gmap address space @@ -67,12 +79,12 @@ static struct gmap *gmap_alloc(unsigned long limit) spin_lock_init(&gmap->guest_table_lock); spin_lock_init(&gmap->shadow_lock); refcount_set(&gmap->ref_count, 1); - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) goto out_free; page->index = 0; list_add(&page->lru, &gmap->crst_list); - table = (unsigned long *) page_to_phys(page); + table = page_to_virt(page); crst_table_init(table, etype); gmap->table = table; gmap->asce = atype | _ASCE_TABLE_LENGTH | @@ -194,9 +206,11 @@ static void gmap_free(struct gmap *gmap) /* Free additional data for a shadow gmap */ if (gmap_is_shadow(gmap)) { + struct ptdesc *ptdesc, *n; + /* Free all page tables. */ - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) - page_table_free_pgste(page); + list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list) + page_table_free_pgste(ptdesc); gmap_rmap_radix_tree_free(&gmap->host_to_rmap); /* Release reference to the parent */ gmap_put(gmap->parent); @@ -308,15 +322,15 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; - new = (unsigned long *) page_to_phys(page); + new = page_to_virt(page); crst_table_init(new, init); spin_lock(&gmap->guest_table_lock); if (*table & _REGION_ENTRY_INVALID) { list_add(&page->lru, &gmap->crst_list); - *table = (unsigned long) new | _REGION_ENTRY_LENGTH | + *table = __pa(new) | _REGION_ENTRY_LENGTH | (*table & _REGION_ENTRY_TYPE_MASK); page->index = gaddr; page = NULL; @@ -336,12 +350,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, static unsigned long __gmap_segment_gaddr(unsigned long *entry) { struct page *page; - unsigned long offset, mask; + unsigned long offset; offset = (unsigned long) entry / sizeof(unsigned long); offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); + page = pmd_pgtable_page((pmd_t *) entry); return page->index + offset; } @@ -557,7 +570,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY, gaddr & _REGION1_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if 
((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) { table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -565,7 +578,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY, gaddr & _REGION2_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) { table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -573,7 +586,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY, gaddr & _REGION3_MASK)) return -ENOMEM; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); } table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; /* Walk the parent mm page table */ @@ -585,12 +598,12 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); /* Are we allowed to use huge pages? */ - if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) + if (pmd_leaf(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m) return -EFAULT; /* Link gmap segment table entry location to page table. */ rc = radix_tree_preload(GFP_KERNEL_ACCOUNT); @@ -602,7 +615,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) rc = radix_tree_insert(&gmap->host_to_guest, vmaddr >> PMD_SHIFT, table); if (!rc) { - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) | _SEGMENT_ENTRY_GMAP_UC; @@ -723,7 +736,7 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); - zap_page_range(vma, vmaddr, size); + zap_page_range_single(vma, vmaddr, size, NULL); } mmap_read_unlock(gmap->mm); } @@ -813,7 +826,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -821,7 +834,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -829,7 +842,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -837,7 +850,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, break; if (*table & _REGION_ENTRY_INVALID) return NULL; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT; } return table; @@ -896,12 +909,12 @@ static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long 
gaddr, /** * gmap_pte_op_end - release the page table lock - * @ptl: pointer to the spinlock pointer + * @ptep: pointer to the locked pte + * @ptl: pointer to the page table spinlock */ -static void gmap_pte_op_end(spinlock_t *ptl) +static void gmap_pte_op_end(pte_t *ptep, spinlock_t *ptl) { - if (ptl) - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } /** @@ -932,7 +945,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) } /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */ - if (!pmd_large(*pmdp)) + if (!pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); return pmdp; } @@ -944,7 +957,7 @@ static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr) */ static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp) { - if (pmd_large(*pmdp)) + if (pmd_leaf(*pmdp)) spin_unlock(&gmap->guest_table_lock); } @@ -1012,7 +1025,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, { int rc; pte_t *ptep; - spinlock_t *ptl = NULL; + spinlock_t *ptl; unsigned long pbits = 0; if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID) @@ -1026,7 +1039,7 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr, pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0; /* Protect and unlock. */ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); return rc; } @@ -1055,7 +1068,7 @@ static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr, rc = -EAGAIN; pmdp = gmap_pmd_op_walk(gmap, gaddr); if (pmdp) { - if (!pmd_large(*pmdp)) { + if (!pmd_leaf(*pmdp)) { rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits); if (!rc) { @@ -1150,12 +1163,12 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) { address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; - *val = *(unsigned long *) address; + *val = *(unsigned long *)__va(address); set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! 
*/ rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } if (!rc) break; @@ -1249,7 +1262,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, if (!rc) gmap_insert_rmap(sg, vmaddr, rmap); spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(ptep, ptl); } radix_tree_preload_end(); if (rc) { @@ -1335,23 +1348,24 @@ static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) { - unsigned long sto, *ste, *pgt; - struct page *page; + unsigned long *ste; + phys_addr_t sto, pgt; + struct ptdesc *ptdesc; BUG_ON(!gmap_is_shadow(sg)); ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN)) return; gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1); - sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); + sto = __pa(ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT)); gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr); - pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN); + pgt = *ste & _SEGMENT_ENTRY_ORIGIN; *ste = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + list_del(&ptdesc->pt_list); + page_table_free_pgste(ptdesc); } /** @@ -1365,21 +1379,21 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long *pgt; - struct page *page; + struct ptdesc *ptdesc; + phys_addr_t pgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; - pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN); + pgt = sgt[i] & _REGION_ENTRY_ORIGIN; sgt[i] = _SEGMENT_ENTRY_EMPTY; - __gmap_unshadow_pgt(sg, raddr, pgt); + __gmap_unshadow_pgt(sg, raddr, __va(pgt)); /* Free page table */ - page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT); - list_del(&page->lru); - page_table_free_pgste(page); + ptdesc = page_ptdesc(phys_to_page(pgt)); + list_del(&ptdesc->pt_list); + page_table_free_pgste(ptdesc); } } @@ -1392,7 +1406,8 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) { - unsigned long r3o, *r3e, *sgt; + unsigned long r3o, *r3e; + phys_addr_t sgt; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1401,12 +1416,12 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1); r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT)); - gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr); - sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r3o) | _ASCE_TYPE_REGION3, raddr); + sgt = *r3e & _REGION_ENTRY_ORIGIN; *r3e = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1422,19 +1437,19 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long *sgt; struct 
page *page; + phys_addr_t sgt; int i; BUG_ON(!gmap_is_shadow(sg)); for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; - sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN); + sgt = r3t[i] & _REGION_ENTRY_ORIGIN; r3t[i] = _REGION3_ENTRY_EMPTY; - __gmap_unshadow_sgt(sg, raddr, sgt); + __gmap_unshadow_sgt(sg, raddr, __va(sgt)); /* Free segment table */ - page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT); + page = phys_to_page(sgt); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1449,7 +1464,8 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) { - unsigned long r2o, *r2e, *r3t; + unsigned long r2o, *r2e; + phys_addr_t r3t; struct page *page; BUG_ON(!gmap_is_shadow(sg)); @@ -1458,12 +1474,12 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1); r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT)); - gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr); - r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r2o) | _ASCE_TYPE_REGION2, raddr); + r3t = *r2e & _REGION_ENTRY_ORIGIN; *r2e = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1479,7 +1495,7 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long *r3t; + phys_addr_t r3t; struct page *page; int i; @@ -1487,11 +1503,11 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; - r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN); + r3t = r2t[i] & _REGION_ENTRY_ORIGIN; r2t[i] = _REGION2_ENTRY_EMPTY; - __gmap_unshadow_r3t(sg, raddr, r3t); + __gmap_unshadow_r3t(sg, raddr, __va(r3t)); /* Free region 3 table */ - page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT); + page = phys_to_page(r3t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1506,8 +1522,9 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, */ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) { - unsigned long r1o, *r1e, *r2t; + unsigned long r1o, *r1e; struct page *page; + phys_addr_t r2t; BUG_ON(!gmap_is_shadow(sg)); r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */ @@ -1515,12 +1532,12 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) return; gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1); r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT)); - gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr); - r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN); + gmap_idte_one(__pa(r1o) | _ASCE_TYPE_REGION1, raddr); + r2t = *r1e & _REGION_ENTRY_ORIGIN; *r1e = _REGION1_ENTRY_EMPTY; - __gmap_unshadow_r2t(sg, raddr, r2t); + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1536,22 +1553,23 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr) static void 
__gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr, unsigned long *r1t) { - unsigned long asce, *r2t; + unsigned long asce; struct page *page; + phys_addr_t r2t; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r1t | _ASCE_TYPE_REGION1; + asce = __pa(r1t) | _ASCE_TYPE_REGION1; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) { if (!(r1t[i] & _REGION_ENTRY_ORIGIN)) continue; - r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN); - __gmap_unshadow_r2t(sg, raddr, r2t); + r2t = r1t[i] & _REGION_ENTRY_ORIGIN; + __gmap_unshadow_r2t(sg, raddr, __va(r2t)); /* Clear entry and flush translation r1t -> r2t */ gmap_idte_one(asce, raddr); r1t[i] = _REGION1_ENTRY_EMPTY; /* Free region 2 table */ - page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT); + page = phys_to_page(r2t); list_del(&page->lru); __free_pages(page, CRST_ALLOC_ORDER); } @@ -1573,7 +1591,7 @@ static void gmap_unshadow(struct gmap *sg) sg->removed = 1; gmap_call_notifier(sg, 0, -1UL); gmap_flush_tlb(sg); - table = (unsigned long *)(sg->asce & _ASCE_ORIGIN); + table = __va(sg->asce & _ASCE_ORIGIN); switch (sg->asce & _ASCE_TYPE_MASK) { case _ASCE_TYPE_REGION1: __gmap_unshadow_r1t(sg, 0, table); @@ -1675,6 +1693,7 @@ struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, return ERR_PTR(-ENOMEM); new->mm = parent->mm; new->parent = gmap_get(parent); + new->private = parent->private; new->orig_asce = asce; new->edat_level = edat_level; new->initialized = false; @@ -1736,7 +1755,7 @@ EXPORT_SYMBOL_GPL(gmap_shadow); * The r2t parameter specifies the address of the source table. The * four pages of the source table are made read-only in the parent gmap * address space. A write to the source table area @r2t will automatically - * remove the shadow r2 table and all of its decendents. + * remove the shadow r2 table and all of its descendants. 
* * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the * shadow table structure is incomplete, -ENOMEM if out of memory and @@ -1748,19 +1767,20 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r2t, *table; + unsigned long *table; + phys_addr_t s_r2t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r2t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r2t = (unsigned long *) page_to_phys(page); + s_r2t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */ @@ -1775,9 +1795,9 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY); + crst_table_init(__va(s_r2t), _REGION2_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH | + *table = s_r2t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r2t & _REGION_ENTRY_PROTECT); @@ -1798,8 +1818,7 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 4); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r2t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r2t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1832,19 +1851,20 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_r3t, *table; + unsigned long *table; + phys_addr_t s_r3t; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg)); /* Allocate a shadow region second table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = r3t & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_r3t = (unsigned long *) page_to_phys(page); + s_r3t = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */ @@ -1859,9 +1879,9 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY); + crst_table_init(__va(s_r3t), _REGION3_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH | + *table = s_r3t | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= (r3t & _REGION_ENTRY_PROTECT); @@ -1882,8 +1902,7 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 3); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_r3t) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_r3t) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -1916,19 +1935,20 @@ int 
gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, int fake) { unsigned long raddr, origin, offset, len; - unsigned long *s_sgt, *table; + unsigned long *table; + phys_addr_t s_sgt; struct page *page; int rc; BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE)); /* Allocate a shadow segment table */ - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + page = gmap_alloc_crst(); if (!page) return -ENOMEM; page->index = sgt & _REGION_ENTRY_ORIGIN; if (fake) page->index |= GMAP_SHADOW_FAKE_TABLE; - s_sgt = (unsigned long *) page_to_phys(page); + s_sgt = page_to_phys(page); /* Install shadow region second table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */ @@ -1943,9 +1963,9 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, rc = -EAGAIN; /* Race with shadow */ goto out_free; } - crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY); + crst_table_init(__va(s_sgt), _SEGMENT_ENTRY_EMPTY); /* mark as invalid as long as the parent table is not protected */ - *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH | + *table = s_sgt | _REGION_ENTRY_LENGTH | _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID; if (sg->edat_level >= 1) *table |= sgt & _REGION_ENTRY_PROTECT; @@ -1966,8 +1986,7 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 2); - if (!table || (*table & _REGION_ENTRY_ORIGIN) != - (unsigned long) s_sgt) + if (!table || (*table & _REGION_ENTRY_ORIGIN) != s_sgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_REGION_ENTRY_INVALID; @@ -2040,19 +2059,20 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, int fake) { unsigned long raddr, origin; - unsigned long *s_pgt, *table; - struct page *page; + unsigned long *table; + struct ptdesc *ptdesc; + phys_addr_t s_pgt; int rc; BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE)); /* Allocate a shadow page table */ - page = page_table_alloc_pgste(sg->mm); - if (!page) + ptdesc = page_table_alloc_pgste(sg->mm); + if (!ptdesc) return -ENOMEM; - page->index = pgt & _SEGMENT_ENTRY_ORIGIN; + ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN; if (fake) - page->index |= GMAP_SHADOW_FAKE_TABLE; - s_pgt = (unsigned long *) page_to_phys(page); + ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE; + s_pgt = page_to_phys(ptdesc_page(ptdesc)); /* Install shadow page table */ spin_lock(&sg->guest_table_lock); table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ @@ -2070,7 +2090,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, /* mark as invalid as long as the parent table is not protected */ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY | (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID; - list_add(&page->lru, &sg->pt_list); + list_add(&ptdesc->pt_list, &sg->pt_list); if (fake) { /* nothing to protect for fake tables */ *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2085,8 +2105,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, spin_lock(&sg->guest_table_lock); if (!rc) { table = gmap_table_walk(sg, saddr, 1); - if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != - (unsigned long) s_pgt) + if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) != s_pgt) rc = -EAGAIN; /* Race with unshadow */ else *table &= ~_SEGMENT_ENTRY_INVALID; @@ -2097,7 +2116,7 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt, return rc; 
out_free: spin_unlock(&sg->guest_table_lock); - page_table_free_pgste(page); + page_table_free_pgste(ptdesc); return rc; } @@ -2152,7 +2171,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) tptep = (pte_t *) gmap_table_walk(sg, saddr, 0); if (!tptep) { spin_unlock(&sg->guest_table_lock); - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); radix_tree_preload_end(); break; } @@ -2163,7 +2182,7 @@ int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte) rmap = NULL; rc = 0; } - gmap_pte_op_end(ptl); + gmap_pte_op_end(sptep, ptl); spin_unlock(&sg->guest_table_lock); } radix_tree_preload_end(); @@ -2481,7 +2500,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], if (!pmdp) return; - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr)) bitmap_fill(bitmap, _PAGE_ENTRIES); } else { @@ -2491,7 +2510,7 @@ void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4], continue; if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep)) set_bit(i, bitmap); - spin_unlock(ptl); + pte_unmap_unlock(ptep, ptl); } } gmap_pmd_op_end(gmap, pmdp); @@ -2510,15 +2529,16 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr, static const struct mm_walk_ops thp_split_walk_ops = { .pmd_entry = thp_split_walk_pmd_entry, + .walk_lock = PGWALK_WRLOCK_VERIFY, }; static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { - vma->vm_flags &= ~VM_HUGEPAGE; - vma->vm_flags |= VM_NOHUGEPAGE; + for_each_vma(vmi, vma) { + vm_flags_mod(vma, VM_NOHUGEPAGE, VM_HUGEPAGE); walk_page_vma(vma, &thp_split_walk_ops, NULL); } mm->def_flags |= VM_NOHUGEPAGE; @@ -2533,7 +2553,12 @@ static inline void thp_split_mm(struct mm_struct *mm) * Remove all empty zero pages from the mapping for lazy refaulting * - This must be called after mm->context.has_pgste is set, to avoid * future creation of zero pages - * - This must be called after THP was enabled + * - This must be called after THP was disabled. + * + * mm contracts with s390, that even if mm were to remove a page table, + * racing with the loop below and so causing pte_offset_map_lock() to fail, + * it will never insert a page table containing empty zero pages once + * mm_forbids_zeropage(mm) i.e. mm->context.has_pgste is set. */ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2545,6 +2570,8 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, spinlock_t *ptl; ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (!ptep) + break; if (is_zero_pfn(pte_pfn(*ptep))) ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); pte_unmap_unlock(ptep, ptl); @@ -2554,6 +2581,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, static const struct mm_walk_ops zap_zero_walk_ops = { .pmd_entry = __zap_zero_pages, + .walk_lock = PGWALK_WRLOCK, }; /* @@ -2581,18 +2609,12 @@ EXPORT_SYMBOL_GPL(s390_enable_sie); int gmap_mark_unmergeable(void) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret; - - for (vma = mm->mmap; vma; vma = vma->vm_next) { - ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags); - if (ret) - return ret; - } - mm->def_flags &= ~VM_MERGEABLE; - return 0; + /* + * Make sure to disable KSM (if enabled for the whole process or + * individual VMAs). 
Note that nothing currently hinders user space + * from re-enabling it. + */ + return ksm_disable(current->mm); } EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); @@ -2650,6 +2672,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, .pmd_entry = __s390_enable_skey_pmd, + .walk_lock = PGWALK_WRLOCK, }; int s390_enable_skey(void) @@ -2687,6 +2710,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, static const struct mm_walk_ops reset_cmma_walk_ops = { .pte_entry = __s390_reset_cmma, + .walk_lock = PGWALK_WRLOCK, }; void s390_reset_cmma(struct mm_struct *mm) @@ -2697,41 +2721,177 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, + .walk_lock = PGWALK_RDLOCK, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. */ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. 
+ * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; + + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the ASCE is a SEGMENT type then this function will return -EINVAL, + * otherwise the pointers in the host_to_guest radix tree will keep pointing + * to the wrong pages, causing use-after-free and memory corruption. + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. 
+ */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + /* Replacing segment type ASCEs would cause serious issues */ + if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) + return -EINVAL; + + page = gmap_alloc_crst(); + if (!page) + return -ENOMEM; + page->index = 0; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 10e51ef9c79a..c2e8242bd15d 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -142,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) __storage_key_init_range(paddr, paddr + size - 1); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; @@ -163,6 +163,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + pte_t huge_ptep_get(pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); @@ -218,7 +224,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); if (pud_present(*pudp)) { - if (pud_large(*pudp)) + if (pud_leaf(*pudp)) return (pte_t *) pudp; pmdp = pmd_offset(pudp, addr); } @@ -229,22 +235,12 @@ pte_t *huge_pte_offset(struct mm_struct *mm, int pmd_huge(pmd_t pmd) { - return pmd_large(pmd); + return pmd_leaf(pmd); } int pud_huge(pud_t pud) { - return pud_large(pud); -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); + return pud_leaf(pud); } bool __init arch_hugetlb_valid_size(unsigned long size) @@ -283,7 +279,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = current->mm->mmap_base; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 6a0ac00d5a42..f6391442c0c2 100644 --- a/arch/s390/mm/init.c 
+++ b/arch/s390/mm/init.c @@ -31,29 +31,29 @@ #include <linux/cma.h> #include <linux/gfp.h> #include <linux/dma-direct.h> -#include <linux/platform-feature.h> +#include <linux/percpu.h> #include <asm/processor.h> #include <linux/uaccess.h> #include <asm/pgalloc.h> +#include <asm/ctlreg.h> #include <asm/kfence.h> -#include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> -#include <asm/ctl_reg.h> #include <asm/sclp.h> #include <asm/set_memory.h> #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> #include <linux/virtio_config.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); -static pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); +pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir"); -unsigned long s390_invalid_asce; +struct ctlreg __bootdata_preserved(s390_invalid_asce); unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); @@ -92,41 +92,12 @@ static void __init setup_zero_pages(void) void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type, asce_bits; - psw_t psw; - - s390_invalid_asce = (unsigned long)invalid_pg_dir; - s390_invalid_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - crst_table_init((unsigned long *)invalid_pg_dir, _REGION3_ENTRY_EMPTY); - init_mm.pgd = swapper_pg_dir; - if (VMALLOC_END > _REGION2_SIZE) { - asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION2_ENTRY_EMPTY; - } else { - asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; - } - init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.kernel_asce = init_mm.context.asce; - S390_lowcore.user_asce = s390_invalid_asce; - crst_table_init((unsigned long *) init_mm.pgd, pgd_type); - vmem_map_init(); - kasan_copy_shadow_mapping(); - - /* enable virtual mapping in kernel mode */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); - kasan_free_early_identity(); + vmem_map_init(); sparse_init(); zone_dma_bits = 31; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); - max_zone_pfns[ZONE_DMA] = PFN_DOWN(MAX_DMA_ADDRESS); + max_zone_pfns[ZONE_DMA] = virt_to_pfn(MAX_DMA_ADDRESS); max_zone_pfns[ZONE_NORMAL] = max_low_pfn; free_area_init(max_zone_pfns); } @@ -135,30 +106,29 @@ void mark_rodata_ro(void) { unsigned long size = __end_ro_after_init - __start_ro_after_init; - set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT); + __set_memory_ro(__start_ro_after_init, __end_ro_after_init); pr_info("Write protected read-only-after-init data: %luk\n", size >> 10); - debug_checkwx(); } -int set_memory_encrypted(unsigned long addr, int numpages) +int set_memory_encrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages unshared, (swiotlb, dma_free) */ for (i = 0; i < numpages; ++i) { - uv_remove_shared(addr); - addr += PAGE_SIZE; + uv_remove_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } -int set_memory_decrypted(unsigned long addr, int numpages) +int set_memory_decrypted(unsigned long vaddr, int numpages) { int i; /* make specified pages shared (swiotlb, dma_alloca) */ for (i = 0; i < 
numpages; ++i) { - uv_set_shared(addr); - addr += PAGE_SIZE; + uv_set_shared(virt_to_phys((void *)vaddr)); + vaddr += PAGE_SIZE; } return 0; } @@ -175,7 +145,7 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); /* make sure bounce buffers are shared */ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); @@ -192,24 +162,16 @@ void __init mem_init(void) pv_init(); kfence_split_mapping(); - /* Setup guest page hinting */ - cmma_init(); /* this will put all low memory onto the freelists */ memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ - - cmma_init_nodat(); } void free_initmem(void) { - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RW | SET_MEMORY_NX); - free_reserved_area(sclp_early_sccb, - sclp_early_sccb + EXT_SCCB_READ_SCP, - POISON_FREE_INITMEM, "unused early sccb"); + set_memory_rwnx((unsigned long)_sinittext, + (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT); free_initmem_default(POISON_FREE_INITMEM); } @@ -222,6 +184,41 @@ unsigned long memory_block_size_bytes(void) return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm); } +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + return LOCAL_DISTANCE; +} + +static int __init pcpu_cpu_to_node(int cpu) +{ + return 0; +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, + pcpu_cpu_distance, + pcpu_cpu_to_node); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} + #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_CMA @@ -282,9 +279,6 @@ int arch_add_memory(int nid, u64 start, u64 size, unsigned long size_pages = PFN_DOWN(size); int rc; - if (WARN_ON_ONCE(params->altmap)) - return -EINVAL; - if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) return -EINVAL; diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c deleted file mode 100644 index 9f988d4582ed..000000000000 --- a/arch/s390/mm/kasan_init.c +++ /dev/null @@ -1,403 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kasan.h> -#include <linux/sched/task.h> -#include <linux/memblock.h> -#include <linux/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/kasan.h> -#include <asm/mem_detect.h> -#include <asm/processor.h> -#include <asm/sclp.h> -#include <asm/facility.h> -#include <asm/sections.h> -#include <asm/setup.h> -#include <asm/uv.h> - -static unsigned long segment_pos __initdata; -static unsigned long segment_low __initdata; -static unsigned long pgalloc_pos __initdata; -static unsigned long pgalloc_low __initdata; -static unsigned long pgalloc_freeable __initdata; -static bool has_edat __initdata; -static bool has_nx __initdata; - -#define __sha(x) ((unsigned long)kasan_mem_to_shadow((void *)x)) - -static pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); - -static void __init kasan_early_panic(const char *reason) -{ - sclp_early_printk("The Linux kernel failed to boot with the 
KernelAddressSanitizer:\n"); - sclp_early_printk(reason); - disabled_wait(); -} - -static void * __init kasan_early_alloc_segment(void) -{ - segment_pos -= _SEGMENT_SIZE; - - if (segment_pos < segment_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)segment_pos; -} - -static void * __init kasan_early_alloc_pages(unsigned int order) -{ - pgalloc_pos -= (PAGE_SIZE << order); - - if (pgalloc_pos < pgalloc_low) - kasan_early_panic("out of memory during initialisation\n"); - - return (void *)pgalloc_pos; -} - -static void * __init kasan_early_crst_alloc(unsigned long val) -{ - unsigned long *table; - - table = kasan_early_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); - return table; -} - -static pte_t * __init kasan_early_pte_alloc(void) -{ - static void *pte_leftover; - pte_t *pte; - - BUILD_BUG_ON(_PAGE_TABLE_SIZE * 2 != PAGE_SIZE); - - if (!pte_leftover) { - pte_leftover = kasan_early_alloc_pages(0); - pte = pte_leftover + _PAGE_TABLE_SIZE; - } else { - pte = pte_leftover; - pte_leftover = NULL; - } - memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); - return pte; -} - -enum populate_mode { - POPULATE_ONE2ONE, - POPULATE_MAP, - POPULATE_ZERO_SHADOW, - POPULATE_SHALLOW -}; -static void __init kasan_early_pgtable_populate(unsigned long address, - unsigned long end, - enum populate_mode mode) -{ - unsigned long pgt_prot_zero, pgt_prot, sgt_prot; - pgd_t *pg_dir; - p4d_t *p4_dir; - pud_t *pu_dir; - pmd_t *pm_dir; - pte_t *pt_dir; - - pgt_prot_zero = pgprot_val(PAGE_KERNEL_RO); - if (!has_nx) - pgt_prot_zero &= ~_PAGE_NOEXEC; - pgt_prot = pgprot_val(PAGE_KERNEL); - sgt_prot = pgprot_val(SEGMENT_KERNEL); - if (!has_nx || mode == POPULATE_ONE2ONE) { - pgt_prot &= ~_PAGE_NOEXEC; - sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC; - } - - /* - * The first 1MB of 1:1 mapping is mapped with 4KB pages - */ - while (address < end) { - pg_dir = pgd_offset_k(address); - if (pgd_none(*pg_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PGDIR_SIZE) && - end - address >= PGDIR_SIZE) { - pgd_populate(&init_mm, pg_dir, - kasan_early_shadow_p4d); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - continue; - } - p4_dir = kasan_early_crst_alloc(_REGION2_ENTRY_EMPTY); - pgd_populate(&init_mm, pg_dir, p4_dir); - } - - if (mode == POPULATE_SHALLOW) { - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - - p4_dir = p4d_offset(pg_dir, address); - if (p4d_none(*p4_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, P4D_SIZE) && - end - address >= P4D_SIZE) { - p4d_populate(&init_mm, p4_dir, - kasan_early_shadow_pud); - address = (address + P4D_SIZE) & P4D_MASK; - continue; - } - pu_dir = kasan_early_crst_alloc(_REGION3_ENTRY_EMPTY); - p4d_populate(&init_mm, p4_dir, pu_dir); - } - - pu_dir = pud_offset(p4_dir, address); - if (pud_none(*pu_dir)) { - if (mode == POPULATE_ZERO_SHADOW && - IS_ALIGNED(address, PUD_SIZE) && - end - address >= PUD_SIZE) { - pud_populate(&init_mm, pu_dir, - kasan_early_shadow_pmd); - address = (address + PUD_SIZE) & PUD_MASK; - continue; - } - pm_dir = kasan_early_crst_alloc(_SEGMENT_ENTRY_EMPTY); - pud_populate(&init_mm, pu_dir, pm_dir); - } - - pm_dir = pmd_offset(pu_dir, address); - if (pmd_none(*pm_dir)) { - if (IS_ALIGNED(address, PMD_SIZE) && - end - address >= PMD_SIZE) { - if (mode == POPULATE_ZERO_SHADOW) { - pmd_populate(&init_mm, pm_dir, kasan_early_shadow_pte); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } else if (has_edat && address) { - void *page; - - if (mode == 
POPULATE_ONE2ONE) { - page = (void *)address; - } else { - page = kasan_early_alloc_segment(); - memset(page, 0, _SEGMENT_SIZE); - } - set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - } - pt_dir = kasan_early_pte_alloc(); - pmd_populate(&init_mm, pm_dir, pt_dir); - } else if (pmd_large(*pm_dir)) { - address = (address + PMD_SIZE) & PMD_MASK; - continue; - } - - pt_dir = pte_offset_kernel(pm_dir, address); - if (pte_none(*pt_dir)) { - void *page; - - switch (mode) { - case POPULATE_ONE2ONE: - page = (void *)address; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_MAP: - page = kasan_early_alloc_pages(0); - memset(page, 0, PAGE_SIZE); - set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); - break; - case POPULATE_ZERO_SHADOW: - page = kasan_early_shadow_page; - set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); - break; - case POPULATE_SHALLOW: - /* should never happen */ - break; - } - } - address += PAGE_SIZE; - } -} - -static void __init kasan_set_pgd(pgd_t *pgd, unsigned long asce_type) -{ - unsigned long asce_bits; - - asce_bits = asce_type | _ASCE_TABLE_LENGTH; - S390_lowcore.kernel_asce = (__pa(pgd) & PAGE_MASK) | asce_bits; - S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - __ctl_load(S390_lowcore.kernel_asce, 13, 13); -} - -static void __init kasan_enable_dat(void) -{ - psw_t psw; - - psw.mask = __extract_psw(); - psw_bits(psw).dat = 1; - psw_bits(psw).as = PSW_BITS_AS_HOME; - __load_psw_mask(psw.mask); -} - -static void __init kasan_early_detect_facilities(void) -{ - if (test_facility(8)) { - has_edat = true; - __ctl_set_bit(0, 23); - } - if (!noexec_disabled && test_facility(130)) { - has_nx = true; - __ctl_set_bit(0, 20); - } -} - -void __init kasan_early_init(void) -{ - unsigned long shadow_alloc_size; - unsigned long initrd_end; - unsigned long memsize; - unsigned long pgt_prot = pgprot_val(PAGE_KERNEL_RO); - pte_t pte_z; - pmd_t pmd_z = __pmd(__pa(kasan_early_shadow_pte) | _SEGMENT_ENTRY); - pud_t pud_z = __pud(__pa(kasan_early_shadow_pmd) | _REGION3_ENTRY); - p4d_t p4d_z = __p4d(__pa(kasan_early_shadow_pud) | _REGION2_ENTRY); - - kasan_early_detect_facilities(); - if (!has_nx) - pgt_prot &= ~_PAGE_NOEXEC; - pte_z = __pte(__pa(kasan_early_shadow_page) | pgt_prot); - - memsize = get_mem_detect_end(); - if (!memsize) - kasan_early_panic("cannot detect physical memory size\n"); - /* - * Kasan currently supports standby memory but only if it follows - * online memory (default allocation), i.e. no memory holes. - * - memsize represents end of online memory - * - ident_map_size represents online + standby and memory limits - * accounted. - * Kasan maps "memsize" right away. - * [0, memsize] - as identity mapping - * [__sha(0), __sha(memsize)] - shadow memory for identity mapping - * The rest [memsize, ident_map_size] if memsize < ident_map_size - * could be mapped/unmapped dynamically later during memory hotplug. 
- */ - memsize = min(memsize, ident_map_size); - - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, P4D_SIZE)); - BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, P4D_SIZE)); - crst_table_init((unsigned long *)early_pg_dir, _REGION2_ENTRY_EMPTY); - - /* init kasan zero shadow */ - crst_table_init((unsigned long *)kasan_early_shadow_p4d, - p4d_val(p4d_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pud, - pud_val(pud_z)); - crst_table_init((unsigned long *)kasan_early_shadow_pmd, - pmd_val(pmd_z)); - memset64((u64 *)kasan_early_shadow_pte, pte_val(pte_z), PTRS_PER_PTE); - - shadow_alloc_size = memsize >> KASAN_SHADOW_SCALE_SHIFT; - pgalloc_low = round_up((unsigned long)_end, _SEGMENT_SIZE); - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD)) { - initrd_end = - round_up(initrd_data.start + initrd_data.size, _SEGMENT_SIZE); - pgalloc_low = max(pgalloc_low, initrd_end); - } - - if (pgalloc_low + shadow_alloc_size > memsize) - kasan_early_panic("out of memory during initialisation\n"); - - if (has_edat) { - segment_pos = round_down(memsize, _SEGMENT_SIZE); - segment_low = segment_pos - shadow_alloc_size; - pgalloc_pos = segment_low; - } else { - pgalloc_pos = memsize; - } - init_mm.pgd = early_pg_dir; - /* - * Current memory layout: - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | | - * | |/ | kasan | - * +- shadow start --+ | zero | - * | 1/8 addr space | | page | - * +- shadow end -+ | mapping | - * | ... gap ... |\ | (untracked) | - * +- vmalloc area -+ \ | | - * | vmalloc_size | \ | | - * +- modules vaddr -+ \ +----------------+ - * | 2Gb | \| unmapped | allocated per module - * +-----------------+ +- shadow end ---+ - * - * Current memory layout (KASAN_VMALLOC): - * +- 0 -------------+ +- shadow start -+ - * | 1:1 ram mapping | /| 1/8 ram | - * | | / | | - * +- end of ram ----+ / +----------------+ - * | ... gap ... | / | kasan | - * | |/ | zero | - * +- shadow start --+ | page | - * | 1/8 addr space | | mapping | - * +- shadow end -+ | (untracked) | - * | ... gap ... |\ | | - * +- vmalloc area -+ \ +- vmalloc area -+ - * | vmalloc_size | \ |shallow populate| - * +- modules vaddr -+ \ +- modules area -+ - * | 2Gb | \|shallow populate| - * +-----------------+ +- shadow end ---+ - */ - /* populate kasan shadow (for identity mapping and zero page mapping) */ - kasan_early_pgtable_populate(__sha(0), __sha(memsize), POPULATE_MAP); - if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - /* shallowly populate kasan shadow for vmalloc and modules */ - kasan_early_pgtable_populate(__sha(VMALLOC_START), __sha(MODULES_END), - POPULATE_SHALLOW); - } - /* populate kasan shadow for untracked memory */ - kasan_early_pgtable_populate(__sha(ident_map_size), - IS_ENABLED(CONFIG_KASAN_VMALLOC) ? 
- __sha(VMALLOC_START) : - __sha(MODULES_VADDR), - POPULATE_ZERO_SHADOW); - kasan_early_pgtable_populate(__sha(MODULES_END), __sha(_REGION1_SIZE), - POPULATE_ZERO_SHADOW); - /* memory allocated for identity mapping structs will be freed later */ - pgalloc_freeable = pgalloc_pos; - /* populate identity mapping */ - kasan_early_pgtable_populate(0, memsize, POPULATE_ONE2ONE); - kasan_set_pgd(early_pg_dir, _ASCE_TYPE_REGION2); - kasan_enable_dat(); - /* enable kasan */ - init_task.kasan_depth = 0; - memblock_reserve(pgalloc_pos, memsize - pgalloc_pos); - sclp_early_printk("KernelAddressSanitizer initialized\n"); -} - -void __init kasan_copy_shadow_mapping(void) -{ - /* - * At this point we are still running on early pages setup early_pg_dir, - * while swapper_pg_dir has just been initialized with identity mapping. - * Carry over shadow memory region from early_pg_dir to swapper_pg_dir. - */ - - pgd_t *pg_dir_src; - pgd_t *pg_dir_dst; - p4d_t *p4_dir_src; - p4d_t *p4_dir_dst; - - pg_dir_src = pgd_offset_raw(early_pg_dir, KASAN_SHADOW_START); - pg_dir_dst = pgd_offset_raw(init_mm.pgd, KASAN_SHADOW_START); - p4_dir_src = p4d_offset(pg_dir_src, KASAN_SHADOW_START); - p4_dir_dst = p4d_offset(pg_dir_dst, KASAN_SHADOW_START); - memcpy(p4_dir_dst, p4_dir_src, - (KASAN_SHADOW_SIZE >> P4D_SHIFT) * sizeof(p4d_t)); -} - -void __init kasan_free_early_identity(void) -{ - memblock_phys_free(pgalloc_pos, pgalloc_freeable - pgalloc_pos); -} diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 421efa46946b..632c3a55feed 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -12,10 +12,17 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> +#include <linux/uio.h> +#include <linux/io.h> #include <asm/asm-extable.h> -#include <asm/ctl_reg.h> -#include <asm/io.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/maccess.h> +#include <asm/ctlreg.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +pte_t *__bootdata_preserved(memcpy_real_ptep); +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -61,159 +68,77 @@ notrace void *s390_kernel_write(void *dst, const void *src, size_t size) long copied; spin_lock_irqsave(&s390_kernel_write_lock, flags); - if (!(flags & PSW_MASK_DAT)) { - memcpy(dst, src, size); - } else { - while (size) { - copied = s390_kernel_write_odd(tmp, src, size); - tmp += copied; - src += copied; - size -= copied; - } + while (size) { + copied = s390_kernel_write_odd(tmp, src, size); + tmp += copied; + src += copied; + size -= copied; } spin_unlock_irqrestore(&s390_kernel_write_lock, flags); return dst; } -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - union register_pair _dst, _src; - int rc = -EFAULT; - - _dst.even = (unsigned long) dest; - _dst.odd = (unsigned long) count; - _src.even = (unsigned long) src; - _src.odd = (unsigned long) count; - asm volatile ( - "0: mvcle %[dst],%[src],0\n" - "1: jo 0b\n" - " lhi %[rc],0\n" - "2:\n" - EX_TABLE(1b,2b) - : [rc] "+&d" (rc), [dst] "+&d" (_dst.pair), [src] "+&d" (_src.pair) - : : "cc", "memory"); - return rc; -} - -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) -{ - int irqs_disabled, rc; - unsigned long flags; - - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - 
__arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; -} - -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, unsigned long src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long _dest = (unsigned long)dest; - unsigned long _src = (unsigned long)src; - unsigned long _count = (unsigned long)count; - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = call_on_stack(3, S390_lowcore.nodat_stack, - unsigned long, _memcpy_real, - unsigned long, _dest, - unsigned long, _src, - unsigned long, _count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real(_dest, _src, _count); -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) -{ - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE); + while (count) { + phys = src & MEMCPY_REAL_MASK; + offset = src & ~MEMCPY_REAL_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, MEMCPY_REAL_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, unsigned long src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(phys_addr_t addr) +static int get_swapped_owner(phys_addr_t addr) { phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 
|| addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -226,17 +151,34 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) { void *ptr = phys_to_virt(addr); void *bounce = ptr; + struct lowcore *abs_lc; unsigned long size; + int this_cpu, cpu; cpus_read_lock(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, ptr, size); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; + } + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + abs_lc = get_abs_lowcore(); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); } - preempt_enable(); +out: + put_cpu(); cpus_read_unlock(); return bounce; } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index d545f5c39f7e..b14fc0887654 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -71,6 +71,15 @@ static inline unsigned long mmap_base(unsigned long rnd, return PAGE_ALIGN(STACK_TOP - gap - rnd); } +static int get_align_mask(struct file *filp, unsigned long flags) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (filp || (flags & MAP_SHARED)) + return MMAP_ALIGN_MASK << PAGE_SHIFT; + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -97,10 +106,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; + info.align_mask = get_align_mask(filp, flags); info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) @@ -136,12 +142,9 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - if (filp || (flags & MAP_SHARED)) - info.align_mask = MMAP_ALIGN_MASK << PAGE_SHIFT; - else - info.align_mask = 0; + info.align_mask = get_align_mask(filp, flags); info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); @@ -188,3 +191,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = 
PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index d5ea09d78938..01f9b39e65f5 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -7,210 +7,18 @@ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> */ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> #include <linux/mm.h> -#include <linux/memblock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <asm/asm-extable.h> -#include <asm/facility.h> #include <asm/page-states.h> +#include <asm/sections.h> +#include <asm/page.h> -static int cmma_flag = 1; - -static int __init cmma(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - cmma_flag = enabled; - return 1; -} -__setup("cmma=", cmma); - -static inline int cmma_test_essa(void) -{ - unsigned long tmp = 0; - int rc = -EOPNOTSUPP; - - /* test ESSA_GET_STATE */ - asm volatile( - " .insn rrf,0xb9ab0000,%[tmp],%[tmp],%[cmd],0\n" - "0: la %[rc],0\n" - "1:\n" - EX_TABLE(0b,1b) - : [rc] "+&d" (rc), [tmp] "+&d" (tmp) - : [cmd] "i" (ESSA_GET_STATE)); - return rc; -} - -void __init cmma_init(void) -{ - if (!cmma_flag) - return; - if (cmma_test_essa()) { - cmma_flag = 0; - return; - } - if (test_facility(147)) - cmma_flag = 2; -} - -static inline unsigned char get_page_state(struct page *page) -{ - unsigned char state; - - asm volatile(" .insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (state) - : "a" (page_to_phys(page)), - "i" (ESSA_GET_STATE)); - return state & 0x3f; -} - -static inline void set_page_unused(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_UNUSED)); -} - -static inline void set_page_stable_dat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE)); -} - -static inline void set_page_stable_nodat(struct page *page, int order) -{ - int i, rc; - - for (i = 0; i < (1 << order); i++) - asm volatile(".insn rrf,0xb9ab0000,%0,%1,%2,0" - : "=&d" (rc) - : "a" (page_to_phys(page + i)), - "i" (ESSA_SET_STABLE_NODAT)); -} - -static void mark_kernel_pmd(pud_t *pud, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pmd_t *pmd; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none(*pmd) || pmd_large(*pmd)) - continue; - page = phys_to_page(pmd_val(*pmd)); - set_bit(PG_arch_1, &page->flags); - } while (pmd++, addr = next, addr != end); -} - -static void mark_kernel_pud(p4d_t *p4d, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - pud_t *pud; - int i; - - pud = pud_offset(p4d, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none(*pud) || pud_large(*pud)) - continue; - if (!pud_folded(*pud)) { - page = phys_to_page(pud_val(*pud)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pmd(pud, addr, next); - } while (pud++, addr = next, addr != end); -} - -static void mark_kernel_p4d(pgd_t *pgd, unsigned long addr, unsigned long end) -{ - unsigned long next; - struct page *page; - p4d_t *p4d; - int i; - - p4d = p4d_offset(pgd, addr); - do { - 
next = p4d_addr_end(addr, end); - if (p4d_none(*p4d)) - continue; - if (!p4d_folded(*p4d)) { - page = phys_to_page(p4d_val(*p4d)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_pud(p4d, addr, next); - } while (p4d++, addr = next, addr != end); -} - -static void mark_kernel_pgd(void) -{ - unsigned long addr, next; - struct page *page; - pgd_t *pgd; - int i; - - addr = 0; - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, MODULES_END); - if (pgd_none(*pgd)) - continue; - if (!pgd_folded(*pgd)) { - page = phys_to_page(pgd_val(*pgd)); - for (i = 0; i < 3; i++) - set_bit(PG_arch_1, &page[i].flags); - } - mark_kernel_p4d(pgd, addr, next); - } while (pgd++, addr = next, addr != MODULES_END); -} - -void __init cmma_init_nodat(void) -{ - struct page *page; - unsigned long start, end, ix; - int i; - - if (cmma_flag < 2) - return; - /* Mark pages used in kernel page tables */ - mark_kernel_pgd(); - - /* Set all kernel pages not used for page tables to stable/no-dat */ - for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { - page = pfn_to_page(start); - for (ix = start; ix < end; ix++, page++) { - if (__test_and_clear_bit(PG_arch_1, &page->flags)) - continue; /* skip page table pages */ - if (!list_empty(&page->lru)) - continue; /* skip free pages */ - set_page_stable_nodat(page, 0); - } - } -} +int __bootdata_preserved(cmma_flag); void arch_free_page(struct page *page, int order) { if (!cmma_flag) return; - set_page_unused(page, order); + __set_page_unused(page_to_virt(page), 1UL << order); } void arch_alloc_page(struct page *page, int order) @@ -218,14 +26,7 @@ void arch_alloc_page(struct page *page, int order) if (!cmma_flag) return; if (cmma_flag < 2) - set_page_stable_dat(page, order); + __set_page_stable_dat(page_to_virt(page), 1UL << order); else - set_page_stable_nodat(page, order); -} - -void arch_set_page_dat(struct page *page, int order) -{ - if (!cmma_flag) - return; - set_page_stable_dat(page, order); + __set_page_stable_nodat(page_to_virt(page), 1UL << order); } diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 85195c18b2e8..01bc8fad64d6 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -4,6 +4,8 @@ * Author(s): Jan Glauber <jang@linux.vnet.ibm.com> */ #include <linux/hugetlb.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> #include <linux/mm.h> #include <asm/cacheflush.h> #include <asm/facility.h> @@ -41,7 +43,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end) } #ifdef CONFIG_PROC_FS -atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX]; +atomic_long_t __bootdata_preserved(direct_pages_count[PG_DIRECT_MAP_MAX]); void arch_report_meminfo(struct seq_file *m) { @@ -73,7 +75,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr, break; } table = (unsigned long *)((unsigned long)old & mask); - crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce); + crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce.val); } else if (MACHINE_HAS_IDTE) { cspg(old, *old, new); } else { @@ -96,11 +98,19 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, if (flags & SET_MEMORY_RO) new = pte_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pte_mkwrite(pte_mkdirty(new)); + new = pte_mkwrite_novma(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = 
set_pte_bit(new, __pgprot(_PAGE_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pte(pte_val(new) & PAGE_MASK); + new = set_pte_bit(new, PAGE_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); + } pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -146,11 +156,19 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, if (flags & SET_MEMORY_RO) new = pmd_wrprotect(new); else if (flags & SET_MEMORY_RW) - new = pmd_mkwrite(pmd_mkdirty(new)); + new = pmd_mkwrite_novma(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pmd(pmd_val(new) & PMD_MASK); + new = set_pmd_bit(new, SEGMENT_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -167,7 +185,7 @@ static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end, if (pmd_none(*pmdp)) return -EINVAL; next = pmd_addr_end(addr, end); - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PMD_MASK); need_split |= !!(addr + PMD_SIZE > next); @@ -232,6 +250,14 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + if (flags & SET_MEMORY_INV) { + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_INVALID)); + } else if (flags & SET_MEMORY_DEF) { + new = __pud(pud_val(new) & PUD_MASK); + new = set_pud_bit(new, REGION3_KERNEL); + if (!MACHINE_HAS_NX) + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); + } pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -248,7 +274,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); @@ -298,11 +324,6 @@ static int change_page_attr(unsigned long addr, unsigned long end, int rc = -EINVAL; pgd_t *pgdp; - if (addr == end) - return 0; - if (end >= MODULES_END) - return -EINVAL; - mutex_lock(&cpa_mutex); pgdp = pgd_offset_k(addr); do { if (pgd_none(*pgdp)) @@ -313,18 +334,76 @@ static int change_page_attr(unsigned long addr, unsigned long end, break; cond_resched(); } while (pgdp++, addr = next, addr < end && !rc); - mutex_unlock(&cpa_mutex); return rc; } -int __set_memory(unsigned long addr, int numpages, unsigned long flags) +static int change_page_attr_alias(unsigned long addr, unsigned long end, + unsigned long flags) { + unsigned long alias, offset, va_start, va_end; + struct vm_struct *area; + int rc = 0; + + /* + * Changes to read-only permissions on kernel VA mappings are also + * applied to the kernel direct mapping. Execute permissions are + * intentionally not transferred to keep all allocated pages within + * the direct mapping non-executable. 
+ */ + flags &= SET_MEMORY_RO | SET_MEMORY_RW; + if (!flags) + return 0; + area = NULL; + while (addr < end) { + if (!area) + area = find_vm_area((void *)addr); + if (!area || !(area->flags & VM_ALLOC)) + return 0; + va_start = (unsigned long)area->addr; + va_end = va_start + area->nr_pages * PAGE_SIZE; + offset = (addr - va_start) >> PAGE_SHIFT; + alias = (unsigned long)page_address(area->pages[offset]); + rc = change_page_attr(alias, alias + PAGE_SIZE, flags); + if (rc) + break; + addr += PAGE_SIZE; + if (addr >= va_end) + area = NULL; + } + return rc; +} + +int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags) +{ + unsigned long end; + int rc; + if (!MACHINE_HAS_NX) flags &= ~(SET_MEMORY_NX | SET_MEMORY_X); if (!flags) return 0; + if (!numpages) + return 0; addr &= PAGE_MASK; - return change_page_attr(addr, addr + numpages * PAGE_SIZE, flags); + end = addr + numpages * PAGE_SIZE; + mutex_lock(&cpa_mutex); + rc = change_page_attr(addr, end, flags); + if (rc) + goto out; + rc = change_page_attr_alias(addr, end, flags); +out: + mutex_unlock(&cpa_mutex); + return rc; +} + +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_INV); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c new file mode 100644 index 000000000000..1aac13bb8f53 --- /dev/null +++ b/arch/s390/mm/pfault.c @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 1999, 2023 + */ + +#include <linux/cpuhotplug.h> +#include <linux/sched/task.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <asm/asm-extable.h> +#include <asm/pfault.h> +#include <asm/diag.h> + +#define __SUBCODE_MASK 0x0600 +#define __PF_RES_FIELD 0x8000000000000000UL + +/* + * 'pfault' pseudo page faults routines. 
+ */ +static int pfault_disable; + +static int __init nopfault(char *str) +{ + pfault_disable = 1; + return 1; +} +early_param("nopfault", nopfault); + +struct pfault_refbk { + u16 refdiagc; + u16 reffcode; + u16 refdwlen; + u16 refversn; + u64 refgaddr; + u64 refselmk; + u64 refcmpmk; + u64 reserved; +}; + +static struct pfault_refbk pfault_init_refbk = { + .refdiagc = 0x258, + .reffcode = 0, + .refdwlen = 5, + .refversn = 2, + .refgaddr = __LC_LPP, + .refselmk = 1UL << 48, + .refcmpmk = 1UL << 48, + .reserved = __PF_RES_FIELD +}; + +int __pfault_init(void) +{ + int rc = -EOPNOTSUPP; + + if (pfault_disable) + return rc; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],%[rc],0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : [rc] "+d" (rc) + : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk) + : "cc"); + return rc; +} + +static struct pfault_refbk pfault_fini_refbk = { + .refdiagc = 0x258, + .reffcode = 1, + .refdwlen = 5, + .refversn = 2, +}; + +void __pfault_fini(void) +{ + if (pfault_disable) + return; + diag_stat_inc(DIAG_STAT_X258); + asm volatile( + " diag %[refbk],0,0x258\n" + "0: nopr %%r7\n" + EX_TABLE(0b, 0b) + : + : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) + : "cc"); +} + +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +#define PF_COMPLETE 0x0080 + +/* + * The mechanism of our pfault code: if Linux is running as guest, runs a user + * space process and the user space process accesses a page that the host has + * paged out we get a pfault interrupt. + * + * This allows us, within the guest, to schedule a different process. Without + * this mechanism the host would have to suspend the whole virtual cpu until + * the page has been paged in. + * + * So when we get such an interrupt then we set the state of the current task + * to uninterruptible and also set the need_resched flag. Both happens within + * interrupt context(!). If we later on want to return to user space we + * recognize the need_resched flag and then call schedule(). It's not very + * obvious how this works... + * + * Of course we have a lot of additional fun with the completion interrupt (-> + * host signals that a page of a process has been paged in and the process can + * continue to run). This interrupt can arrive on any cpu and, since we have + * virtual cpus, actually appear before the interrupt that signals that a page + * is missing. + */ +static void pfault_interrupt(struct ext_code ext_code, + unsigned int param32, unsigned long param64) +{ + struct task_struct *tsk; + __u16 subcode; + pid_t pid; + + /* + * Get the external interruption subcode & pfault initial/completion + * signal bit. VM stores this in the 'cpu address' field associated + * with the external interrupt. + */ + subcode = ext_code.subcode; + if ((subcode & 0xff00) != __SUBCODE_MASK) + return; + inc_irq_stat(IRQEXT_PFL); + /* Get the token (= pid of the affected task). */ + pid = param64 & LPP_PID_MASK; + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + spin_lock(&pfault_lock); + if (subcode & PF_COMPLETE) { + /* signal bit is set -> a page has been swapped in by VM */ + if (tsk->thread.pfault_wait == 1) { + /* + * Initial interrupt was faster than the completion + * interrupt. pfault_wait is valid. Set pfault_wait + * back to zero and wake up the process. This can + * safely be done because the task is still sleeping + * and can't produce new pfaults. 
+ */ + tsk->thread.pfault_wait = 0; + list_del(&tsk->thread.list); + wake_up_process(tsk); + put_task_struct(tsk); + } else { + /* + * Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. + * If the task is not running, ignore the completion + * interrupt since it must be a leftover of a PFAULT + * CANCEL operation which didn't remove all pending + * completion interrupts. + */ + if (task_is_running(tsk)) + tsk->thread.pfault_wait = -1; + } + } else { + /* signal bit not set -> a real page is missing. */ + if (WARN_ON_ONCE(tsk != current)) + goto out; + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + goto block; + } else if (tsk->thread.pfault_wait == -1) { + /* + * Completion interrupt was faster than the initial + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. + */ + tsk->thread.pfault_wait = 0; + } else { + /* + * Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. + */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + list_add(&tsk->thread.list, &pfault_list); +block: + /* + * Since this must be a userspace fault, there + * is no kernel task state to trample. Rely on the + * return to userspace schedule() to block. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + set_preempt_need_resched(); + } + } +out: + spin_unlock(&pfault_lock); + put_task_struct(tsk); +} + +static int pfault_cpu_dead(unsigned int cpu) +{ + struct thread_struct *thread, *next; + struct task_struct *tsk; + + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(thread, next, &pfault_list, list) { + thread->pfault_wait = 0; + list_del(&thread->list); + tsk = container_of(thread, struct task_struct, thread); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + return 0; +} + +static int __init pfault_irq_init(void) +{ + int rc; + + rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); + if (rc) + goto out_extint; + rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP; + if (rc) + goto out_pfault; + irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL); + cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead", + NULL, pfault_cpu_dead); + return 0; + +out_pfault: + unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt); +out_extint: + pfault_disable = 1; + return rc; +} +early_initcall(pfault_irq_init); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 2de48b2c1b04..abb629d7e131 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -10,6 +10,7 @@ #include <linux/slab.h> #include <linux/mm.h> #include <asm/mmu_context.h> +#include <asm/page-states.h> #include <asm/pgalloc.h> #include <asm/gmap.h> #include <asm/tlb.h> @@ -30,22 +31,11 @@ static struct ctl_table page_table_sysctl[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - { } -}; - -static struct ctl_table page_table_sysctl_dir[] = { - { - .procname = "vm", - .maxlen = 0, - .mode = 0555, - .child = page_table_sysctl, - }, - { } }; static int __init page_table_register_sysctl(void) { - return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; + return register_sysctl("vm", page_table_sysctl) ? 
0 : -ENOMEM; } __initcall(page_table_register_sysctl); @@ -53,17 +43,19 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + unsigned long *table; - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); + return table; } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -72,8 +64,8 @@ static void __crst_table_upgrade(void *arg) /* change all active ASCEs to avoid the creation of new TLBs */ if (current->active_mm == mm) { - S390_lowcore.user_asce = mm->context.asce; - __ctl_load(S390_lowcore.user_asce, 7, 7); + S390_lowcore.user_asce.val = mm->context.asce; + local_ctl_load(7, &S390_lowcore.user_asce); } __tlb_flush_local(); } @@ -141,287 +133,97 @@ err_p4d: return -ENOMEM; } -static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) -{ - unsigned int old, new; - - do { - old = atomic_read(v); - new = old ^ bits; - } while (atomic_cmpxchg(v, old, new) != old); - return new; -} - #ifdef CONFIG_PGSTE -struct page *page_table_alloc_pgste(struct mm_struct *mm) +struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc; } -void page_table_free_pgste(struct page *page) +void page_table_free_pgste(struct ptdesc *ptdesc) { - __free_page(page); + pagetable_free(ptdesc); } #endif /* CONFIG_PGSTE */ -/* - * A 2KB-pgtable is either upper or lower half of a normal page. - * The second half of the page may be unused or used as another - * 2KB-pgtable. - * - * Whenever possible the parent page for a new 2KB-pgtable is picked - * from the list of partially allocated pages mm_context_t::pgtable_list. - * In case the list is empty a new parent page is allocated and added to - * the list. - * - * When a parent page gets fully allocated it contains 2KB-pgtables in both - * upper and lower halves and is removed from mm_context_t::pgtable_list. - * - * When 2KB-pgtable is freed from to fully allocated parent page that - * page turns partially allocated and added to mm_context_t::pgtable_list. - * - * If 2KB-pgtable is freed from the partially allocated parent page that - * page turns unused and gets removed from mm_context_t::pgtable_list. - * Furthermore, the unused parent page is released. - * - * As follows from the above, no unallocated or fully allocated parent - * pages are contained in mm_context_t::pgtable_list. 
- * - * The upper byte (bits 24-31) of the parent page _refcount is used - * for tracking contained 2KB-pgtables and has the following format: - * - * PP AA - * 01234567 upper byte (bits 24-31) of struct page::_refcount - * || || - * || |+--- upper 2KB-pgtable is allocated - * || +---- lower 2KB-pgtable is allocated - * |+------- upper 2KB-pgtable is pending for removal - * +-------- lower 2KB-pgtable is pending for removal - * - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why - * using _refcount is possible). - * - * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still unallocated; - * - removed from mm_context_t::pgtable_list in case both hales of the - * parent page are allocated; - * These operations are protected with mm_context_t::lock. - * - * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 - * and the corresponding PP bit is set to 1 in a single atomic operation. - * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually - * exclusive and may never be both set to 1! - * The parent page is either: - * - added to mm_context_t::pgtable_list in case the second half of the - * parent page is still allocated; - * - removed from mm_context_t::pgtable_list in case the second half of - * the parent page is unallocated; - * These operations are protected with mm_context_t::lock. - * - * It is important to understand that mm_context_t::lock only protects - * mm_context_t::pgtable_list and AA bits, but not the parent page itself - * and PP bits. - * - * Releasing the parent page happens whenever the PP bit turns from 1 to 0, - * while both AA bits and the second PP bit are already unset. Then the - * parent page does not contain any 2KB-pgtable fragment anymore, and it has - * also been removed from mm_context_t::pgtable_list. It is safe to release - * the page therefore. - * - * PGSTE memory spaces use full 4KB-pgtables and do not need most of the - * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable - * while the PP bits are never used, nor such a page is added to or removed - * from mm_context_t::pgtable_list. - */ unsigned long *page_table_alloc(struct mm_struct *mm) { + struct ptdesc *ptdesc; unsigned long *table; - struct page *page; - unsigned int mask, bit; - - /* Try to get a fragment of a 4K page as a 2K page table */ - if (!mm_alloc_pgste(mm)) { - table = NULL; - spin_lock_bh(&mm->context.lock); - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; - /* - * The pending removal bits must also be checked. - * Failure to do so might lead to an impossible - * value of (i.e 0x13 or 0x23) written to _refcount. - * Such values violate the assumption that pending and - * allocation bits are mutually exclusive, and the rest - * of the code unrails as result. That could lead to - * a whole bunch of races and corruptions. 
- */ - mask = (mask | (mask >> 4)) & 0x03U; - if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); - bit = mask & 1; /* =1 -> second 2K */ - if (bit) - table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, - 0x01U << (bit + 24)); - list_del(&page->lru); - } - } - spin_unlock_bh(&mm->context.lock); - if (table) - return table; - } - /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); - /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); - if (mm_alloc_pgste(mm)) { - /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 0x03U << 24); - memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); - memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); - } else { - /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); - memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); - spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.lock); - } + table = ptdesc_to_virt(ptdesc); + __arch_set_page_dat(table, 1); + /* pt_list is used by gmap only */ + INIT_LIST_HEAD(&ptdesc->pt_list); + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); return table; } -static void page_table_release_check(struct page *page, void *table, - unsigned int half, unsigned int mask) +static void pagetable_pte_dtor_free(struct ptdesc *ptdesc) { - char msg[128]; - - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) - return; - snprintf(msg, sizeof(msg), - "Invalid pgtable %p release half 0x%02x mask 0x%02x", - table, half, mask); - dump_page(page, msg); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { - unsigned int mask, bit, half; - struct page *page; - - page = virt_to_page(table); - if (!mm_alloc_pgste(mm)) { - /* Free 2K page table fragment of a 4K page */ - bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. 
The actual release - * will happen outside of the critical section from this - * function or from __tlb_remove_table() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - half = 0x01U << bit; - } else { - half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - } + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor_free(ptdesc); } -void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, - unsigned long vmaddr) +void __tlb_remove_table(void *table) { - struct mm_struct *mm; - struct page *page; - unsigned int bit, mask; - - mm = tlb->mm; - page = virt_to_page(table); - if (mm_alloc_pgste(mm)) { - gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); + struct page *page = ptdesc_page(ptdesc); + + if (compound_order(page) == CRST_ALLOC_ORDER) { + /* pmd, pud, or p4d */ + pagetable_free(ptdesc); return; } - bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.lock); - /* - * Mark the page for delayed release. The actual release will happen - * outside of the critical section from __tlb_remove_table() or from - * page_table_free() - */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); - mask >>= 24; - if (mask & 0x03U) - list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); - spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); - tlb_remove_table(tlb, table); + pagetable_pte_dtor_free(ptdesc); } -void __tlb_remove_table(void *_table) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) { - unsigned int mask = (unsigned long) _table & 0x03U, half = mask; - void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head); - switch (half) { - case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); - return; - case 0x01U: /* lower 2K of a 4K page table */ - case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); - mask >>= 24; - if (mask != 0x00U) - return; - break; - case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); - mask >>= 24; - break; - } + pagetable_pte_dtor_free(ptdesc); +} - page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); + + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); + /* + * THPs are not allowed for KVM guests. Warn if pgste ever reaches here. + * Turn to the generic pte_free_defer() version once gmap is removed. 
+ */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, @@ -448,16 +250,19 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 4909dcd762e8..2c944bafb030 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -125,32 +125,23 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, static inline pgste_t pgste_get_lock(pte_t *ptep) { - unsigned long new = 0; + unsigned long value = 0; #ifdef CONFIG_PGSTE - unsigned long old; - - asm( - " lg %0,%2\n" - "0: lgr %1,%0\n" - " nihh %0,0xff7f\n" /* clear PCL bit in old */ - " oihh %1,0x0080\n" /* set PCL bit in new */ - " csg %0,%1,%2\n" - " jl 0b\n" - : "=&d" (old), "=&d" (new), "=Q" (ptep[PTRS_PER_PTE]) - : "Q" (ptep[PTRS_PER_PTE]) : "cc", "memory"); + unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); + + do { + value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); + } while (value & PGSTE_PCL_BIT); + value |= PGSTE_PCL_BIT; #endif - return __pgste(new); + return __pgste(value); } static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) { #ifdef CONFIG_PGSTE - asm( - " nihh %1,0xff7f\n" /* clear PCL bit */ - " stg %1,%0\n" - : "=Q" (ptep[PTRS_PER_PTE]) - : "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE]) - : "cc", "memory"); + barrier(); + WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); #endif } @@ -302,6 +293,31 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(ptep_xchg_direct); +/* + * Caller must check that new PTE only differs in _PAGE_PROTECT HW bit, so that + * RDP can be used instead of IPTE. See also comments at pte_allow_rdp(). + */ +void ptep_reset_dat_prot(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t new) +{ + preempt_disable(); + atomic_inc(&mm->context.flush_count); + if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) + __ptep_rdp(addr, ptep, 0, 0, 1); + else + __ptep_rdp(addr, ptep, 0, 0, 0); + /* + * PTE is not invalidated by RDP, only _PAGE_PROTECT is cleared. That + * means it is still valid and active, and must not be changed according + * to the architecture. But writing a new value that only differs in SW + * bits is allowed. + */ + set_pte(ptep, new); + atomic_dec(&mm->context.flush_count); + preempt_enable(); +} +EXPORT_SYMBOL(ptep_reset_dat_prot); + pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t new) { @@ -454,7 +470,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. 
*/ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); @@ -705,9 +721,9 @@ static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry) if (!non_swap_entry(entry)) dec_mm_counter(mm, MM_SWAPENTS); else if (is_migration_entry(entry)) { - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); - dec_mm_counter(mm, mm_counter(page)); + dec_mm_counter(mm, mm_counter(folio)); } free_swap_and_cache(entry); } @@ -731,7 +747,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, pte_clear(mm, addr, ptep); } if (reset) - pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK; + pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -804,14 +820,14 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return key ? -EFAULT : 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; /* @@ -825,6 +841,8 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT | PGSTE_ACC_BITS | PGSTE_FP_BIT); @@ -913,14 +931,14 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; cc = page_reset_referenced(paddr); @@ -930,6 +948,8 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr) spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; new = old = pgste_get_lock(ptep); /* Reset guest reference bit only */ pgste_val(new) &= ~PGSTE_GR_BIT; @@ -975,14 +995,14 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, default: return -EFAULT; } - +again: ptl = pmd_lock(mm, pmdp); if (!pmd_present(*pmdp)) { spin_unlock(ptl); return 0; } - if (pmd_large(*pmdp)) { + if (pmd_leaf(*pmdp)) { paddr = pmd_val(*pmdp) & HPAGE_MASK; paddr |= addr & ~HPAGE_MASK; *key = page_get_storage_key(paddr); @@ -992,6 +1012,8 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr, spin_unlock(ptl); ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + if (!ptep) + goto again; pgste = pgste_get_lock(ptep); *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56; paddr = pte_val(*ptep) & PAGE_MASK; diff --git a/arch/s390/mm/physaddr.c b/arch/s390/mm/physaddr.c new file mode 100644 index 000000000000..59de866c72d9 --- /dev/null +++ b/arch/s390/mm/physaddr.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mmdebug.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <asm/page.h> + +unsigned long __phys_addr(unsigned long x, bool is_31bit) +{ + VIRTUAL_BUG_ON(is_vmalloc_or_module_addr((void *)(x))); + x = __pa_nodebug(x); + if (is_31bit) + VIRTUAL_BUG_ON(x >> 31); + return x; +} +EXPORT_SYMBOL(__phys_addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index c2583f921ca8..85cddf904cb2 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -11,8 +11,11 @@ #include <linux/list.h> #include <linux/hugetlb.h> #include 
<linux/slab.h> +#include <linux/sort.h> +#include <asm/page-states.h> #include <asm/cacheflush.h> #include <asm/nospec-branch.h> +#include <asm/ctlreg.h> #include <asm/pgalloc.h> #include <asm/setup.h> #include <asm/tlbflush.h> @@ -30,11 +33,15 @@ static void __ref *vmem_alloc_pages(unsigned int order) return memblock_alloc(size, size); } -static void vmem_free_pages(unsigned long addr, int order) +static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) { + if (altmap) { + vmem_altmap_free(altmap, 1 << order); + return; + } /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) + WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) return; free_pages(addr, order); } @@ -44,8 +51,10 @@ void *vmem_crst_alloc(unsigned long val) unsigned long *table; table = vmem_alloc_pages(CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + if (!table) + return NULL; + crst_table_init(table, val); + __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -61,6 +70,7 @@ pte_t __ref *vmem_pte_alloc(void) if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); + __arch_set_page_dat(pte, 1); return pte; } @@ -150,7 +160,8 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long prot, pages = 0; int ret = -ENOMEM; @@ -166,11 +177,11 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); if (!new_page) goto out; @@ -207,7 +218,8 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, - unsigned long end, bool add, bool direct) + unsigned long end, bool add, bool direct, + struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -224,15 +236,15 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!add) { if (pmd_none(*pmd)) continue; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); pages++; } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); pmd_clear(pmd); } continue; @@ -240,7 +252,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && addr && direct && + MACHINE_HAS_EDAT1 && direct && !debug_pagealloc_enabled()) { set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; @@ -255,7 +267,7 
@@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, * page tables since vmemmap_populate gets * called for each section separately. */ - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); if (new_page) { set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || @@ -269,12 +281,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, if (!pte) goto out; pmd_populate(&init_mm, pmd, pte); - } else if (pmd_large(*pmd)) { + } else if (pmd_leaf(*pmd)) { if (!direct) vmemmap_use_sub_pmd(addr, next); continue; } - ret = modify_pte_table(pmd, addr, next, add, direct); + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -289,27 +301,19 @@ out: static void try_free_pmd_table(pud_t *pud, unsigned long start) { - const unsigned long end = start + PUD_SIZE; pmd_t *pmd; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif pmd = pmd_offset(pud, start); for (i = 0; i < PTRS_PER_PMD; i++, pmd++) if (!pmd_none(*pmd)) return; - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); pud_clear(pud); } static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next, prot, pages = 0; int ret = -ENOMEM; @@ -325,7 +329,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -336,7 +340,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && addr && direct && + MACHINE_HAS_EDAT2 && direct && !debug_pagealloc_enabled()) { set_pud(pud, __pud(__pa(addr) | prot)); pages++; @@ -346,10 +350,10 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } - ret = modify_pmd_table(pud, addr, next, add, direct); + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -364,29 +368,20 @@ out: static void try_free_pud_table(p4d_t *p4d, unsigned long start) { - const unsigned long end = start + P4D_SIZE; pud_t *pud; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - pud = pud_offset(p4d, start); for (i = 0; i < PTRS_PER_PUD; i++, pud++) { if (!pud_none(*pud)) return; } - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); p4d_clear(p4d); } static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, - bool add, bool direct) + bool add, bool direct, struct vmem_altmap *altmap) { unsigned long next; int ret = -ENOMEM; @@ -405,7 +400,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, goto out; p4d_populate(&init_mm, p4d, pud); } - ret 
= modify_pud_table(p4d, addr, next, add, direct); + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -418,29 +413,20 @@ out: static void try_free_p4d_table(pgd_t *pgd, unsigned long start) { - const unsigned long end = start + PGDIR_SIZE; p4d_t *p4d; int i; - /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ - if (end > VMALLOC_START) - return; -#ifdef CONFIG_KASAN - if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end) - return; -#endif - p4d = p4d_offset(pgd, start); for (i = 0; i < PTRS_PER_P4D; i++, p4d++) { if (!p4d_none(*p4d)) return; } - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); pgd_clear(pgd); } static int modify_pagetable(unsigned long start, unsigned long end, bool add, - bool direct) + bool direct, struct vmem_altmap *altmap) { unsigned long addr, next; int ret = -ENOMEM; @@ -449,6 +435,9 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end))) return -EINVAL; + /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */ + if (WARN_ON_ONCE(end > VMALLOC_START)) + return -EINVAL; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); pgd = pgd_offset_k(addr); @@ -462,7 +451,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add, goto out; pgd_populate(&init_mm, pgd, p4d); } - ret = modify_p4d_table(pgd, addr, next, add, direct); + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); if (ret) goto out; if (!add) @@ -475,14 +464,16 @@ out: return ret; } -static int add_pagetable(unsigned long start, unsigned long end, bool direct) +static int add_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, true, direct); + return modify_pagetable(start, end, true, direct, altmap); } -static int remove_pagetable(unsigned long start, unsigned long end, bool direct) +static int remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { - return modify_pagetable(start, end, false, direct); + return modify_pagetable(start, end, false, direct, altmap); } /* @@ -490,7 +481,8 @@ static int remove_pagetable(unsigned long start, unsigned long end, bool direct) */ static int vmem_add_range(unsigned long start, unsigned long size) { - return add_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + return add_pagetable(start, start + size, true, NULL); } /* @@ -498,7 +490,8 @@ static int vmem_add_range(unsigned long start, unsigned long size) */ static void vmem_remove_range(unsigned long start, unsigned long size) { - remove_pagetable(start, start + size, true); + start = (unsigned long)__va(start); + remove_pagetable(start, start + size, true, NULL); } /* @@ -511,21 +504,25 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, mutex_lock(&vmem_mutex); /* We don't care about the node, just use NUMA_NO_NODE on allocations */ - ret = add_pagetable(start, end, false); + ret = add_pagetable(start, end, false, altmap); if (ret) - remove_pagetable(start, end, false); + remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); return ret; } +#ifdef CONFIG_MEMORY_HOTPLUG + void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) { mutex_lock(&vmem_mutex); - remove_pagetable(start, end, false); + 
remove_pagetable(start, end, false, altmap); mutex_unlock(&vmem_mutex); } +#endif + void vmem_remove_mapping(unsigned long start, unsigned long size) { mutex_lock(&vmem_mutex); @@ -538,7 +535,7 @@ struct range arch_get_mappable_range(void) struct range mhp_range; mhp_range.start = 0; - mhp_range.end = VMEM_MAX_PHYS - 1; + mhp_range.end = max_mappable - 1; return mhp_range; } @@ -561,33 +558,125 @@ int vmem_add_mapping(unsigned long start, unsigned long size) } /* - * map whole physical memory to virtual memory (identity mapping) - * we reserve enough space in the vmalloc area for vmemmap to hotplug - * additional memory segments. + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserved for 4KB mappings only. */ +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_leaf(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + void __init vmem_map_init(void) { - phys_addr_t base, end; - u64 i; - - for_each_mem_range(i, &base, &end) - vmem_add_range(base, end - base); - __set_memory((unsigned long)_stext, - (unsigned long)(_etext - _stext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory((unsigned long)_etext, - (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT, - SET_MEMORY_RO); - __set_memory((unsigned long)_sinittext, - (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, - SET_MEMORY_RO | SET_MEMORY_X); - - /* lowcore must be executable for 
LPSWE */ + __set_memory_rox(_stext, _etext); + __set_memory_ro(_etext, __end_rodata); + __set_memory_rox(_sinittext, _einittext); + __set_memory_rox(__stext_amode31, __etext_amode31); + /* + * If the BEAR-enhancement facility is not installed the first + * prefix page is used to return to the previous context with + * an LPSWE instruction and therefore must be executable. + */ if (!static_key_enabled(&cpu_has_bear)) set_memory_x(0, 1); - + if (debug_pagealloc_enabled()) { + /* + * Use RELOC_HIDE() as long as __va(0) translates to NULL, + * since performing pointer arithmetic on a NULL pointer + * has undefined behavior and generates compiler warnings. + */ + __set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size)); + } + if (MACHINE_HAS_NX) + system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } |
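
Note on the pgtable.c hunk above: the patch replaces the csg-based inline assembly in pgste_get_lock()/pgste_set_unlock() with an atomic OR that spins until PGSTE_PCL_BIT is observed clear, and an unlock that stores the value back with the bit removed. The stand-alone user-space sketch below illustrates that bit-lock pattern with the equivalent GCC __atomic builtins. It is not kernel code and not part of the patch; names such as PCL_BIT, pgste_lock() and the demo counter are invented for the illustration.

/*
 * Illustration of the fetch-or bit-lock pattern used by the reworked
 * pgste_get_lock()/pgste_set_unlock(): acquire by spinning on an atomic
 * OR of the lock bit, release by writing the value back with the bit
 * cleared. Build with: gcc -O2 -pthread demo.c
 */
#include <stdio.h>
#include <pthread.h>

#define PCL_BIT (1UL << 56)	/* arbitrary lock bit chosen for the demo */

static unsigned long pgste;	/* stands in for ptep[PTRS_PER_PTE] */
static unsigned long counter;	/* state protected by the lock bit */

static unsigned long pgste_lock(unsigned long *ptr)
{
	unsigned long old;

	/* spin until the previous value had the lock bit clear */
	do {
		old = __atomic_fetch_or(ptr, PCL_BIT, __ATOMIC_ACQUIRE);
	} while (old & PCL_BIT);
	return old | PCL_BIT;
}

static void pgste_unlock(unsigned long *ptr, unsigned long val)
{
	/* write back the (possibly updated) value with the lock bit cleared */
	__atomic_store_n(ptr, val & ~PCL_BIT, __ATOMIC_RELEASE);
}

static void *worker(void *arg)
{
	unsigned long val;
	int i;

	for (i = 0; i < 100000; i++) {
		val = pgste_lock(&pgste);
		counter++;		/* critical section */
		pgste_unlock(&pgste, val);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, NULL);
	pthread_create(&t2, NULL, worker, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("counter=%lu (expected 200000)\n", counter);
	return 0;
}

If the bit-lock serializes the increments as intended, the final counter reads 200000; removing the acquire/release ordering or the spin loop makes the data race visible.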