Diffstat (limited to 'arch/s390/mm')
-rw-r--r--  arch/s390/mm/Makefile           |   2
-rw-r--r--  arch/s390/mm/cmm.c              |   2
-rw-r--r--  arch/s390/mm/dump_pagetables.c  |  20
-rw-r--r--  arch/s390/mm/extable.c          |  81
-rw-r--r--  arch/s390/mm/fault.c            | 110
-rw-r--r--  arch/s390/mm/gmap.c             | 226
-rw-r--r--  arch/s390/mm/hugetlbpage.c      |  47
-rw-r--r--  arch/s390/mm/init.c             |  21
-rw-r--r--  arch/s390/mm/kasan_init.c       |   8
-rw-r--r--  arch/s390/mm/maccess.c          | 217
-rw-r--r--  arch/s390/mm/mmap.c             |  26
-rw-r--r--  arch/s390/mm/page-states.c      |   1
-rw-r--r--  arch/s390/mm/pageattr.c         |  33
-rw-r--r--  arch/s390/mm/pgalloc.c          | 252
-rw-r--r--  arch/s390/mm/pgtable.c          |  46
-rw-r--r--  arch/s390/mm/vmem.c             | 123
16 files changed, 819 insertions, 396 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index cd67e94c16aa..57e4f3a24829 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,7 +4,7 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 2203164b39da..9141ed4c52e9 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10_range(addr >> PAGE_SHIFT, 1); + diag10_range(virt_to_pfn(addr), 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c index 9f9af5298dd6..9953819d7959 100644 --- a/arch/s390/mm/dump_pagetables.c +++ b/arch/s390/mm/dump_pagetables.c @@ -8,8 +8,10 @@ #include <linux/kasan.h> #include <asm/ptdump.h> #include <asm/kasan.h> +#include <asm/abs_lowcore.h> #include <asm/nospec-branch.h> #include <asm/sections.h> +#include <asm/maccess.h> static unsigned long max_addr; @@ -21,6 +23,8 @@ struct addr_marker { enum address_markers_idx { IDENTITY_BEFORE_NR = 0, IDENTITY_BEFORE_END_NR, + AMODE31_START_NR, + AMODE31_END_NR, KERNEL_START_NR, KERNEL_END_NR, #ifdef CONFIG_KFENCE @@ -39,11 +43,17 @@ enum address_markers_idx { VMALLOC_END_NR, MODULES_NR, MODULES_END_NR, + ABS_LOWCORE_NR, + ABS_LOWCORE_END_NR, + MEMCPY_REAL_NR, + MEMCPY_REAL_END_NR, }; static struct addr_marker address_markers[] = { [IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"}, [IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"}, + [AMODE31_START_NR] = {0, "Amode31 Area Start"}, + [AMODE31_END_NR] = {0, "Amode31 Area End"}, [KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"}, [KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"}, #ifdef CONFIG_KFENCE @@ -62,6 +72,10 @@ static struct addr_marker address_markers[] = { [VMALLOC_END_NR] = {0, "vmalloc Area End"}, [MODULES_NR] = {0, "Modules Area Start"}, [MODULES_END_NR] = {0, "Modules Area End"}, + [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"}, + [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"}, + [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"}, + [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"}, { -1, NULL } }; @@ -276,8 +290,14 @@ static int pt_dump_init(void) max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2; max_addr = 1UL << (max_addr * 11 + 31); address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size; + address_markers[AMODE31_START_NR].start_address = __samode31; + address_markers[AMODE31_END_NR].start_address = __eamode31; address_markers[MODULES_NR].start_address = MODULES_VADDR; address_markers[MODULES_END_NR].start_address = MODULES_END; + address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore; + address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE; + address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area; + address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE; address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap; address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size; address_markers[VMALLOC_NR].start_address = VMALLOC_START; diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c new file mode 
100644 index 000000000000..1e4d2187541a --- /dev/null +++ b/arch/s390/mm/extable.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bitfield.h> +#include <linux/extable.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/panic.h> +#include <asm/asm-extable.h> +#include <asm/extable.h> + +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + size_t num; + + fixup = search_exception_tables(addr); + if (fixup) + return fixup; + num = __stop_amode31_ex_table - __start_amode31_ex_table; + return search_extable(__start_amode31_ex_table, num, addr); +} + +static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + size_t len = FIELD_GET(EX_DATA_LEN, ex->data); + + regs->gprs[reg_err] = -EFAULT; + memset((void *)regs->gprs[reg_addr], 0, len); + regs->psw.addr = extable_fixup(ex); + return true; +} + +static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs) +{ + unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data); + unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data); + + regs->gprs[reg_err] = -EFAULT; + regs->gprs[reg_zero] = 0; + regs->psw.addr = extable_fixup(ex); + return true; +} + +bool fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *ex; + + ex = s390_search_extables(instruction_pointer(regs)); + if (!ex) + return false; + switch (ex->type) { + case EX_TYPE_FIXUP: + return ex_handler_fixup(ex, regs); + case EX_TYPE_BPF: + return ex_handler_bpf(ex, regs); + case EX_TYPE_UA_STORE: + return ex_handler_ua_store(ex, regs); + case EX_TYPE_UA_LOAD_MEM: + return ex_handler_ua_load_mem(ex, regs); + case EX_TYPE_UA_LOAD_REG: + return ex_handler_ua_load_reg(ex, regs); + } + panic("invalid exception table entry"); +} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index d30f5986fa85..9649d9382e0a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -32,6 +32,7 @@ #include <linux/uaccess.h> #include <linux/hugetlb.h> #include <linux/kfence.h> +#include <asm/asm-extable.h> #include <asm/asm-offsets.h> #include <asm/diag.h> #include <asm/gmap.h> @@ -115,7 +116,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R1:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION2: table += (address & _REGION2_INDEX) >> _REGION2_SHIFT; @@ -124,7 +125,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R2:%016lx ", *table); if (*table & _REGION_ENTRY_INVALID) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_REGION3: table += (address & _REGION3_INDEX) >> _REGION3_SHIFT; @@ -133,7 +134,7 @@ static void 
dump_pagetable(unsigned long asce, unsigned long address) pr_cont("R3:%016lx ", *table); if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); + table = __va(*table & _REGION_ENTRY_ORIGIN); fallthrough; case _ASCE_TYPE_SEGMENT: table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; @@ -142,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address) pr_cont("S:%016lx ", *table); if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE)) goto out; - table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN); + table = __va(*table & _SEGMENT_ENTRY_ORIGIN); } table += (address & _PAGE_INDEX) >> _PAGE_SHIFT; if (bad_address(table)) @@ -227,27 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -const struct exception_table_entry *s390_search_extables(unsigned long addr) -{ - const struct exception_table_entry *fixup; - - fixup = search_extable(__start_amode31_ex_table, - __stop_amode31_ex_table - __start_amode31_ex_table, - addr); - if (!fixup) - fixup = search_exception_tables(addr); - return fixup; -} - static noinline void do_no_context(struct pt_regs *regs) { - const struct exception_table_entry *fixup; - - /* Are we prepared to handle this kernel fault? */ - fixup = s390_search_extables(regs->psw.addr); - if (fixup && ex_handle(fixup, regs)) + if (fixup_exception(regs)) return; - /* * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. @@ -284,8 +268,7 @@ static noinline void do_sigbus(struct pt_regs *regs) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -static noinline void do_fault_error(struct pt_regs *regs, int access, - vm_fault_t fault) +static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault) { int si_code; @@ -395,7 +378,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) flags = FAULT_FLAG_DEFAULT; if (user_mode(regs)) flags |= FAULT_FLAG_USER; - if (access == VM_WRITE || is_write) + if (is_write) + access = VM_WRITE; + if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; mmap_read_lock(mm); @@ -435,8 +420,6 @@ retry: if (unlikely(!(vma->vm_flags & access))) goto out_up; - if (is_vm_hugetlb_page(vma)) - address &= HPAGE_MASK; /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -449,25 +432,37 @@ retry: goto out_up; goto out; } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto out_gmap; + } + fault = 0; + goto out; + } + if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_RETRY) { - if (IS_ENABLED(CONFIG_PGSTE) && gmap && - (flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* FAULT_FLAG_RETRY_NOWAIT has been set, - * mmap_lock has not been released */ - current->thread.gmap_pfault = 1; - fault = VM_FAULT_PFAULT; - goto out_up; - } - flags &= ~FAULT_FLAG_RETRY_NOWAIT; - flags |= FAULT_FLAG_TRIED; - mmap_read_lock(mm); - goto retry; + if (fault & VM_FAULT_RETRY) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* + * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has + * not been released + */ + current->thread.gmap_pfault = 1; + fault = VM_FAULT_PFAULT; + goto out_up; } + flags &= ~FAULT_FLAG_RETRY_NOWAIT; + flags |= FAULT_FLAG_TRIED; + 
mmap_read_lock(mm); + goto retry; } +out_gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); @@ -520,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs) fault = do_exception(regs, access); } if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_protection_exception); @@ -532,7 +527,7 @@ void do_dat_exception(struct pt_regs *regs) access = VM_ACCESS_FLAGS; fault = do_exception(regs, access); if (unlikely(fault)) - do_fault_error(regs, access, fault); + do_fault_error(regs, fault); } NOKPROBE_SYMBOL(do_dat_exception); @@ -770,6 +765,7 @@ void do_secure_storage_access(struct pt_regs *regs) struct vm_area_struct *vma; struct mm_struct *mm; struct page *page; + struct gmap *gmap; int rc; /* @@ -799,13 +795,24 @@ void do_secure_storage_access(struct pt_regs *regs) } switch (get_fault_type(regs)) { + case GMAP_FAULT: + mm = current->mm; + gmap = (struct gmap *)S390_lowcore.gmap; + mmap_read_lock(mm); + addr = __gmap_translate(gmap, addr); + mmap_read_unlock(mm); + if (IS_ERR_VALUE(addr)) { + do_fault_error(regs, VM_FAULT_BADMAP); + break; + } + fallthrough; case USER_FAULT: mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); if (!vma) { mmap_read_unlock(mm); - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); break; } page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); @@ -827,9 +834,8 @@ void do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); break; - case GMAP_FAULT: default: - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); WARN_ON_ONCE(1); } } @@ -841,7 +847,7 @@ void do_non_secure_storage_access(struct pt_regs *regs) struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; if (get_fault_type(regs) != GMAP_FAULT) { - do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + do_fault_error(regs, VM_FAULT_BADMAP); WARN_ON_ONCE(1); return; } @@ -853,6 +859,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access); void do_secure_storage_violation(struct pt_regs *regs) { + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + /* + * If the VM has been rebooted, its address space might still contain + * secure pages from the previous boot. + * Clear the page so it can be reused. + */ + if (!gmap_destroy_page(gmap, gaddr)) + return; /* * Either KVM messed up the secure guest mapping or the same * page is mapped into multiple secure guests. 
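
The new arch/s390/mm/extable.c above replaces the old one-size-fits-all fixup with typed exception-table entries: fixup_exception() switches on ex->type and, for the user-access types, pulls register numbers out of ex->data before writing -EFAULT and resuming at the fixup target. The following standalone sketch models that dispatch; the structure layout, the EX_DATA field packing, and all constants are assumptions made for the model only, not the encoding used by asm/asm-extable.h.

/* ex_model.c - standalone model of the typed extable dispatch above.
 * The EX_DATA_* field layout is an illustrative assumption, not the
 * kernel's actual encoding.
 */
#include <stdio.h>

#define EFAULT 14

enum ex_type {
	EX_TYPE_FIXUP = 1,
	EX_TYPE_UA_STORE = 2,
	EX_TYPE_UA_LOAD_REG = 3,
};

/* assumed layout: bits 0-3 = error register, bits 4-7 = address/zero register */
#define EX_DATA_REG_ERR(data)	((data) & 0xf)
#define EX_DATA_REG_ADDR(data)	(((data) >> 4) & 0xf)

struct exception_table_entry {
	unsigned long fixup;	/* address to resume at after the fault */
	int type;
	unsigned int data;
};

struct pt_regs {
	unsigned long psw_addr;
	long gprs[16];
};

static int fixup_exception(const struct exception_table_entry *ex,
			   struct pt_regs *regs)
{
	switch (ex->type) {
	case EX_TYPE_FIXUP:
		break;
	case EX_TYPE_UA_STORE:
		regs->gprs[EX_DATA_REG_ERR(ex->data)] = -EFAULT;
		break;
	case EX_TYPE_UA_LOAD_REG:
		regs->gprs[EX_DATA_REG_ERR(ex->data)] = -EFAULT;
		regs->gprs[EX_DATA_REG_ADDR(ex->data)] = 0;
		break;
	default:
		return 0;	/* no handler: the kernel would panic here */
	}
	regs->psw_addr = ex->fixup;	/* skip the faulting access */
	return 1;
}

int main(void)
{
	struct exception_table_entry ex = {
		.fixup = 0x1000, .type = EX_TYPE_UA_LOAD_REG, .data = 0x52,
	};
	struct pt_regs regs = { .psw_addr = 0x500 };

	fixup_exception(&ex, &regs);
	/* prints psw=0x1000 r2=-14 r5=0 */
	printf("psw=%#lx r2=%ld r5=%ld\n", regs.psw_addr, regs.gprs[2], regs.gprs[5]);
	return 0;
}

Running the model shows the same contract the in-kernel handlers implement with FIELD_GET() on the real entry encoding: the faulting access is skipped, the error register reports -EFAULT, and the destination register is cleared.
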
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index dfee0ebb2fac..02d15c8dc92e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -974,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, return -EAGAIN; if (prot == PROT_NONE && !pmd_i) { - pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (prot == PROT_READ && !pmd_p) { - pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; - pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (bits & GMAP_NOTIFY_MPROT) - pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); /* Shadow GMAP protection needs split PMDs */ if (bits & GMAP_NOTIFY_SHADOW) @@ -1151,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1278,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -2275,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify); static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, unsigned long gaddr) { - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } @@ -2294,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, { gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); - pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2302,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *pmdp = new; + set_pmd(pmdp, new); } static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, @@ -2324,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); } @@ -2447,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, return false; /* Clear UC 
indication and reset protection */ - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); return true; } @@ -2508,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = { static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &thp_split_walk_ops, NULL); @@ -2577,8 +2585,9 @@ int gmap_mark_unmergeable(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags); if (ret) @@ -2601,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, unsigned long hmask, unsigned long next, struct mm_walk *walk) @@ -2623,12 +2644,14 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); + cond_resched(); return 0; } static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, }; int s390_enable_skey(void) @@ -2676,41 +2699,168 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. 
*/ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. 
+ * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index da36d13ffc16..c299a18273ff 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -9,6 +9,7 @@ #define KMSG_COMPONENT "hugetlb" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include <asm/pgalloc.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> @@ -72,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + unsigned long pteval; int present; - pte_t pte; if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) present = pud_present(__pud(rste)); @@ -101,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste) * u unused, l large */ if (present) { - pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; - pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT; - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ, - _PAGE_READ); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, - _PAGE_WRITE); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, - _PAGE_INVALID); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, - _PAGE_PROTECT); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, - _PAGE_DIRTY); - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, - _PAGE_YOUNG); + pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE; + pteval |= _PAGE_LARGE | _PAGE_PRESENT; + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ); + pteval 
|= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG); #ifdef CONFIG_MEM_SOFT_DIRTY - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, - _PAGE_SOFT_DIRTY); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif - pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, - _PAGE_NOEXEC); + pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); } else - pte_val(pte) = _PAGE_INVALID; - return pte; + pteval = _PAGE_INVALID; + return __pte(pteval); } static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) @@ -167,7 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, rste |= _SEGMENT_ENTRY_LARGE; clear_huge_pte_skeys(mm, rste); - pte_val(*ptep) = rste; + set_pte(ptep, __pte(rste)); } pte_t huge_ptep_get(pte_t *ptep) @@ -244,16 +237,6 @@ int pud_huge(pud_t pud) return pud_large(pud); } -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - if (flags & FOLL_GET) - return NULL; - - return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); -} - bool __init arch_hugetlb_valid_size(unsigned long size) { if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 8c6f258a6183..97d66a3e60fb 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -37,7 +37,7 @@ #include <asm/kfence.h> #include <asm/ptdump.h> #include <asm/dma.h> -#include <asm/lowcore.h> +#include <asm/abs_lowcore.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/sections.h> @@ -47,6 +47,7 @@ #include <asm/kasan.h> #include <asm/dma-mapping.h> #include <asm/uv.h> +#include <linux/virtio_anchor.h> #include <linux/virtio_config.h> pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); @@ -168,25 +169,16 @@ bool force_dma_unencrypted(struct device *dev) return is_prot_virt_guest(); } -#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS - -int arch_has_restricted_virtio_memory_access(void) -{ - return is_prot_virt_guest(); -} -EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access); - -#endif - /* protected virtualization */ static void pv_init(void) { if (!is_prot_virt_guest()) return; + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); + /* make sure bounce buffers are shared */ - swiotlb_force = SWIOTLB_FORCE; - swiotlb_init(1); + swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); swiotlb_update_mem_attributes(); } @@ -215,6 +207,9 @@ void free_initmem(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); + free_reserved_area(sclp_early_sccb, + sclp_early_sccb + EXT_SCCB_READ_SCP, + POISON_FREE_INITMEM, "unused early sccb"); free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c index 483b9dbe0970..9f988d4582ed 100644 --- a/arch/s390/mm/kasan_init.c +++ b/arch/s390/mm/kasan_init.c @@ -175,7 +175,7 @@ static void __init kasan_early_pgtable_populate(unsigned long address, page = kasan_early_alloc_segment(); memset(page, 0, _SEGMENT_SIZE); } - pmd_val(*pm_dir) = __pa(page) | sgt_prot; + set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot)); address = (address + PMD_SIZE) & PMD_MASK; continue; } @@ 
-194,16 +194,16 @@ static void __init kasan_early_pgtable_populate(unsigned long address, switch (mode) { case POPULATE_ONE2ONE: page = (void *)address; - pte_val(*pt_dir) = __pa(page) | pgt_prot; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); break; case POPULATE_MAP: page = kasan_early_alloc_pages(0); memset(page, 0, PAGE_SIZE); - pte_val(*pt_dir) = __pa(page) | pgt_prot; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot)); break; case POPULATE_ZERO_SHADOW: page = kasan_early_shadow_page; - pte_val(*pt_dir) = __pa(page) | pgt_prot_zero; + set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero)); break; case POPULATE_SHALLOW: /* should never happen */ diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 9663ce3625bc..1571cdcb0c50 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -4,8 +4,6 @@ * * Copyright IBM Corp. 2009, 2015 * - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>, - * */ #include <linux/uaccess.h> @@ -14,9 +12,17 @@ #include <linux/errno.h> #include <linux/gfp.h> #include <linux/cpu.h> +#include <linux/uio.h> +#include <asm/asm-extable.h> #include <asm/ctl_reg.h> #include <asm/io.h> +#include <asm/abs_lowcore.h> #include <asm/stacktrace.h> +#include <asm/maccess.h> + +unsigned long __bootdata_preserved(__memcpy_real_area); +static __ro_after_init pte_t *memcpy_real_ptep; +static DEFINE_MUTEX(memcpy_real_mutex); static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { @@ -77,144 +83,72 @@ notrace void *s390_kernel_write(void *dst, const void *src, size_t size) return dst; } -static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count) -{ - union register_pair _dst, _src; - int rc = -EFAULT; - - _dst.even = (unsigned long) dest; - _dst.odd = (unsigned long) count; - _src.even = (unsigned long) src; - _src.odd = (unsigned long) count; - asm volatile ( - "0: mvcle %[dst],%[src],0\n" - "1: jo 0b\n" - " lhi %[rc],0\n" - "2:\n" - EX_TABLE(1b,2b) - : [rc] "+&d" (rc), [dst] "+&d" (_dst.pair), [src] "+&d" (_src.pair) - : : "cc", "memory"); - return rc; -} - -static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, - unsigned long src, - unsigned long count) +void __init memcpy_real_init(void) { - int irqs_disabled, rc; - unsigned long flags; - - if (!count) - return 0; - flags = arch_local_irq_save(); - irqs_disabled = arch_irqs_disabled_flags(flags); - if (!irqs_disabled) - trace_hardirqs_off(); - __arch_local_irq_stnsm(0xf8); // disable DAT - rc = __memcpy_real((void *) dest, (void *) src, (size_t) count); - if (flags & PSW_MASK_DAT) - __arch_local_irq_stosm(0x04); // enable DAT - if (!irqs_disabled) - trace_hardirqs_on(); - __arch_local_irq_ssm(flags); - return rc; + memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true); + if (!memcpy_real_ptep) + panic("Couldn't setup memcpy real area"); } -/* - * Copy memory in real mode (kernel to kernel) - */ -int memcpy_real(void *dest, void *src, size_t count) -{ - unsigned long _dest = (unsigned long)dest; - unsigned long _src = (unsigned long)src; - unsigned long _count = (unsigned long)count; - int rc; - - if (S390_lowcore.nodat_stack != 0) { - preempt_disable(); - rc = call_on_stack(3, S390_lowcore.nodat_stack, - unsigned long, _memcpy_real, - unsigned long, _dest, - unsigned long, _src, - unsigned long, _count); - preempt_enable(); - return rc; - } - /* - * This is a really early memcpy_real call, the stacks are - * not set up yet. 
Just call _memcpy_real on the early boot - * stack - */ - return _memcpy_real(_dest, _src, _count); -} - -/* - * Copy memory in absolute mode (kernel to kernel) - */ -void memcpy_absolute(void *dest, void *src, size_t count) +size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count) { - unsigned long cr0, flags, prefix; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_clear_bit(0, 28); /* disable lowcore protection */ - prefix = store_prefix(); - if (prefix) { - local_mcck_disable(); - set_prefix(0); - memcpy(dest, src, count); - set_prefix(prefix); - local_mcck_enable(); - } else { - memcpy(dest, src, count); + size_t len, copied, res = 0; + unsigned long phys, offset; + void *chunk; + pte_t pte; + + while (count) { + phys = src & PAGE_MASK; + offset = src & ~PAGE_MASK; + chunk = (void *)(__memcpy_real_area + offset); + len = min(count, PAGE_SIZE - offset); + pte = mk_pte_phys(phys, PAGE_KERNEL_RO); + + mutex_lock(&memcpy_real_mutex); + if (pte_val(pte) != pte_val(*memcpy_real_ptep)) { + __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL); + set_pte(memcpy_real_ptep, pte); + } + copied = copy_to_iter(chunk, len, iter); + mutex_unlock(&memcpy_real_mutex); + + count -= copied; + src += copied; + res += copied; + if (copied < len) + break; } - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); + return res; } -/* - * Copy memory from kernel (real) to user (virtual) - */ -int copy_to_user_real(void __user *dest, void *src, unsigned long count) +int memcpy_real(void *dest, unsigned long src, size_t count) { - int offs = 0, size, rc; - char *buf; - - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - return -ENOMEM; - rc = -EFAULT; - while (offs < count) { - size = min(PAGE_SIZE, count - offs); - if (memcpy_real(buf, src + offs, size)) - goto out; - if (copy_to_user(dest + offs, buf, size)) - goto out; - offs += size; - } - rc = 0; -out: - free_page((unsigned long) buf); - return rc; + struct iov_iter iter; + struct kvec kvec; + + kvec.iov_base = dest; + kvec.iov_len = count; + iov_iter_kvec(&iter, WRITE, &kvec, 1, count); + if (memcpy_real_iter(&iter, src, count) < count) + return -EFAULT; + return 0; } /* - * Check if physical address is within prefix or zero page + * Find CPU that owns swapped prefix page */ -static int is_swapped(unsigned long addr) +static int get_swapped_owner(phys_addr_t addr) { - unsigned long lc; + phys_addr_t lc; int cpu; - if (addr < sizeof(struct lowcore)) - return 1; for_each_online_cpu(cpu) { - lc = (unsigned long) lowcore_ptr[cpu]; + lc = virt_to_phys(lowcore_ptr[cpu]); if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc) continue; - return 1; + return cpu; } - return 0; + return -1; } /* @@ -225,18 +159,37 @@ static int is_swapped(unsigned long addr) */ void *xlate_dev_mem_ptr(phys_addr_t addr) { - void *bounce = (void *) addr; + void *ptr = phys_to_virt(addr); + void *bounce = ptr; + struct lowcore *abs_lc; + unsigned long flags; unsigned long size; + int this_cpu, cpu; cpus_read_lock(); - preempt_disable(); - if (is_swapped(addr)) { - size = PAGE_SIZE - (addr & ~PAGE_MASK); - bounce = (void *) __get_free_page(GFP_ATOMIC); - if (bounce) - memcpy_absolute(bounce, (void *) addr, size); + this_cpu = get_cpu(); + if (addr >= sizeof(struct lowcore)) { + cpu = get_swapped_owner(addr); + if (cpu < 0) + goto out; } - preempt_enable(); + bounce = (void *)__get_free_page(GFP_ATOMIC); + if (!bounce) + goto out; + size = PAGE_SIZE - (addr & ~PAGE_MASK); + if (addr < sizeof(struct lowcore)) { + 
abs_lc = get_abs_lowcore(&flags); + ptr = (void *)abs_lc + addr; + memcpy(bounce, ptr, size); + put_abs_lowcore(abs_lc, flags); + } else if (cpu == this_cpu) { + ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu])); + memcpy(bounce, ptr, size); + } else { + memcpy(bounce, ptr, size); + } +out: + put_cpu(); cpus_read_unlock(); return bounce; } @@ -244,8 +197,8 @@ void *xlate_dev_mem_ptr(phys_addr_t addr) /* * Free converted buffer for /dev/mem access (if necessary) */ -void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf) +void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr) { - if ((void *) addr != buf) - free_page((unsigned long) buf); + if (addr != virt_to_phys(ptr)) + free_page((unsigned long)ptr); } diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index e54f928503c5..3327c47bc181 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -37,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack) unsigned long arch_mmap_rnd(void) { - return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT; + return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT; } static unsigned long mmap_base_legacy(unsigned long rnd) @@ -58,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd, /* * Top of mmap area (just below the process stack). - * Leave at least a ~32 MB hole. + * Leave at least a ~128 MB hole. */ - gap_min = 32 * 1024 * 1024UL; + gap_min = SZ_128M; gap_max = (STACK_TOP / 6) * 5; if (gap < gap_min) @@ -188,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c index 18a6381097a9..d5ea09d78938 100644 --- a/arch/s390/mm/page-states.c +++ b/arch/s390/mm/page-states.c @@ -14,6 +14,7 @@ #include <linux/memblock.h> #include <linux/gfp.h> #include <linux/init.h> +#include <asm/asm-extable.h> #include <asm/facility.h> #include <asm/page-states.h> diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 654019181a37..85195c18b2e8 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -98,9 +98,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end, else if (flags & SET_MEMORY_RW) new = pte_mkwrite(pte_mkdirty(new)); if (flags & SET_MEMORY_NX) - pte_val(new) |= _PAGE_NOEXEC; + new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC)); else if (flags & SET_MEMORY_X) - pte_val(new) &= ~_PAGE_NOEXEC; + new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC)); pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE); ptep++; addr += PAGE_SIZE; @@ -127,11 +127,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr) prot &= ~_PAGE_NOEXEC; ptep = pt_dir; for (i = 0; i < PTRS_PER_PTE; i++) { - pte_val(*ptep) = pte_addr | prot; + set_pte(ptep, __pte(pte_addr | prot)); pte_addr += PAGE_SIZE; ptep++; } - pmd_val(new) = 
__pa(pt_dir) | _SEGMENT_ENTRY; + new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE); update_page_count(PG_DIRECT_MAP_1M, -1); @@ -148,9 +148,9 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pmd_mkwrite(pmd_mkdirty(new)); if (flags & SET_MEMORY_NX) - pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC)); pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT); } @@ -208,11 +208,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr) prot &= ~_SEGMENT_ENTRY_NOEXEC; pmdp = pm_dir; for (i = 0; i < PTRS_PER_PMD; i++) { - pmd_val(*pmdp) = pmd_addr | prot; + set_pmd(pmdp, __pmd(pmd_addr | prot)); pmd_addr += PMD_SIZE; pmdp++; } - pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY; + new = __pud(__pa(pm_dir) | _REGION3_ENTRY); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD); update_page_count(PG_DIRECT_MAP_2G, -1); @@ -229,9 +229,9 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr, else if (flags & SET_MEMORY_RW) new = pud_mkwrite(pud_mkdirty(new)); if (flags & SET_MEMORY_NX) - pud_val(new) |= _REGION_ENTRY_NOEXEC; + new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); else if (flags & SET_MEMORY_X) - pud_val(new) &= ~_REGION_ENTRY_NOEXEC; + new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC)); pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3); } @@ -347,23 +347,24 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr) void __kernel_map_pages(struct page *page, int numpages, int enable) { unsigned long address; + pte_t *ptep, pte; int nr, i, j; - pte_t *pte; for (i = 0; i < numpages;) { address = (unsigned long)page_to_virt(page + i); - pte = virt_to_kpte(address); - nr = (unsigned long)pte >> ilog2(sizeof(long)); + ptep = virt_to_kpte(address); + nr = (unsigned long)ptep >> ilog2(sizeof(long)); nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1)); nr = min(numpages - i, nr); if (enable) { for (j = 0; j < nr; j++) { - pte_val(*pte) &= ~_PAGE_INVALID; + pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); address += PAGE_SIZE; - pte++; + ptep++; } } else { - ipte_range(pte, address, nr); + ipte_range(ptep, address, nr); } i += nr; } diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 781965f7210e..2de48b2c1b04 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -53,17 +53,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, 2); + struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (!page) return NULL; - arch_set_page_dat(page, 2); + arch_set_page_dat(page, CRST_ALLOC_ORDER); return (unsigned long *) page_to_virt(page); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long) table, 2); + free_pages((unsigned long)table, CRST_ALLOC_ORDER); } static void __crst_table_upgrade(void *arg) @@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page) #endif /* CONFIG_PGSTE */ /* - * page table entry allocation/free routines. + * A 2KB-pgtable is either upper or lower half of a normal page. 
+ * The second half of the page may be unused or used as another + * 2KB-pgtable. + * + * Whenever possible the parent page for a new 2KB-pgtable is picked + * from the list of partially allocated pages mm_context_t::pgtable_list. + * In case the list is empty a new parent page is allocated and added to + * the list. + * + * When a parent page gets fully allocated it contains 2KB-pgtables in both + * upper and lower halves and is removed from mm_context_t::pgtable_list. + * + * When 2KB-pgtable is freed from to fully allocated parent page that + * page turns partially allocated and added to mm_context_t::pgtable_list. + * + * If 2KB-pgtable is freed from the partially allocated parent page that + * page turns unused and gets removed from mm_context_t::pgtable_list. + * Furthermore, the unused parent page is released. + * + * As follows from the above, no unallocated or fully allocated parent + * pages are contained in mm_context_t::pgtable_list. + * + * The upper byte (bits 24-31) of the parent page _refcount is used + * for tracking contained 2KB-pgtables and has the following format: + * + * PP AA + * 01234567 upper byte (bits 24-31) of struct page::_refcount + * || || + * || |+--- upper 2KB-pgtable is allocated + * || +---- lower 2KB-pgtable is allocated + * |+------- upper 2KB-pgtable is pending for removal + * +-------- lower 2KB-pgtable is pending for removal + * + * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why + * using _refcount is possible). + * + * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still unallocated; + * - removed from mm_context_t::pgtable_list in case both hales of the + * parent page are allocated; + * These operations are protected with mm_context_t::lock. + * + * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0 + * and the corresponding PP bit is set to 1 in a single atomic operation. + * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually + * exclusive and may never be both set to 1! + * The parent page is either: + * - added to mm_context_t::pgtable_list in case the second half of the + * parent page is still allocated; + * - removed from mm_context_t::pgtable_list in case the second half of + * the parent page is unallocated; + * These operations are protected with mm_context_t::lock. + * + * It is important to understand that mm_context_t::lock only protects + * mm_context_t::pgtable_list and AA bits, but not the parent page itself + * and PP bits. + * + * Releasing the parent page happens whenever the PP bit turns from 1 to 0, + * while both AA bits and the second PP bit are already unset. Then the + * parent page does not contain any 2KB-pgtable fragment anymore, and it has + * also been removed from mm_context_t::pgtable_list. It is safe to release + * the page therefore. + * + * PGSTE memory spaces use full 4KB-pgtables and do not need most of the + * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable + * while the PP bits are never used, nor such a page is added to or removed + * from mm_context_t::pgtable_list. 
*/ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm) page = list_first_entry(&mm->context.pgtable_list, struct page, lru); mask = atomic_read(&page->_refcount) >> 24; - mask = (mask | (mask >> 4)) & 3; - if (mask != 3) { + /* + * The pending removal bits must also be checked. + * Failure to do so might lead to an impossible + * value of (i.e 0x13 or 0x23) written to _refcount. + * Such values violate the assumption that pending and + * allocation bits are mutually exclusive, and the rest + * of the code unrails as result. That could lead to + * a whole bunch of races and corruptions. + */ + mask = (mask | (mask >> 4)) & 0x03U; + if (mask != 0x03U) { table = (unsigned long *) page_to_virt(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, - 1U << (bit + 24)); + 0x01U << (bit + 24)); list_del(&page->lru); } } @@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_xor_bits(&page->_refcount, 3 << 24); + atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 1 << 24); + atomic_xor_bits(&page->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); @@ -234,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } +static void page_table_release_check(struct page *page, void *table, + unsigned int half, unsigned int mask) +{ + char msg[128]; + + if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + return; + snprintf(msg, sizeof(msg), + "Invalid pgtable %p release half 0x%02x mask 0x%02x", + table, half, mask); + dump_page(page, msg); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { + unsigned int mask, bit, half; struct page *page; - unsigned int bit, mask; page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); + /* + * Mark the page for delayed release. 
The actual release + * will happen outside of the critical section from this + * function or from __tlb_remove_table() + */ + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 3) + if (mask & 0x03U) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); spin_unlock_bh(&mm->context.lock); - if (mask != 0) + mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask >>= 24; + if (mask != 0x00U) return; + half = 0x01U << bit; } else { - atomic_xor_bits(&page->_refcount, 3U << 24); + half = 0x03U; + mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask >>= 24; } + page_table_release_check(page, table, half, mask); pgtable_pte_page_dtor(page); __free_page(page); } @@ -272,47 +373,54 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) ((unsigned long)table | 3); + table = (unsigned long *) ((unsigned long)table | 0x03U); tlb_remove_table(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); + /* + * Mark the page for delayed release. The actual release will happen + * outside of the critical section from __tlb_remove_table() or from + * page_table_free() + */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 3) + if (mask & 0x03U) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) ((unsigned long) table | (1U << bit)); + table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); } void __tlb_remove_table(void *_table) { - unsigned int mask = (unsigned long) _table & 3; + unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); struct page *page = virt_to_page(table); - switch (mask) { - case 0: /* pmd, pud, or p4d */ - free_pages((unsigned long) table, 2); - break; - case 1: /* lower 2K of a 4K page table */ - case 2: /* higher 2K of a 4K page table */ + switch (half) { + case 0x00U: /* pmd, pud, or p4d */ + free_pages((unsigned long)table, CRST_ALLOC_ORDER); + return; + case 0x01U: /* lower 2K of a 4K page table */ + case 0x02U: /* higher 2K of a 4K page table */ mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); mask >>= 24; - if (mask != 0) - break; - fallthrough; - case 3: /* 4K page table with pgstes */ - if (mask & 3) - atomic_xor_bits(&page->_refcount, 3 << 24); - pgtable_pte_page_dtor(page); - __free_page(page); + if (mask != 0x00U) + return; + break; + case 0x03U: /* 4K page table with pgstes */ + mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask >>= 24; break; } + + page_table_release_check(page, table, half, mask); + pgtable_pte_page_dtor(page); + __free_page(page); } /* @@ -322,34 +430,34 @@ void __tlb_remove_table(void *_table) static struct kmem_cache *base_pgt_cache; -static unsigned long base_pgt_alloc(void) +static unsigned long *base_pgt_alloc(void) { - u64 *table; + unsigned long *table; table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL); if (table) - memset64(table, _PAGE_INVALID, PTRS_PER_PTE); - return (unsigned long) table; + memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); + return table; } -static void base_pgt_free(unsigned long table) +static void base_pgt_free(unsigned long *table) { - kmem_cache_free(base_pgt_cache, (void *) 
table); + kmem_cache_free(base_pgt_cache, table); } -static unsigned long base_crst_alloc(unsigned long val) +static unsigned long *base_crst_alloc(unsigned long val) { - unsigned long table; + unsigned long *table; - table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); if (table) - crst_table_init((unsigned long *)table, val); + crst_table_init(table, val); return table; } -static void base_crst_free(unsigned long table) +static void base_crst_free(unsigned long *table) { - free_pages(table, CRST_ALLOC_ORDER); + free_pages((unsigned long)table, CRST_ALLOC_ORDER); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ @@ -377,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address) return real; } -static int base_page_walk(unsigned long origin, unsigned long addr, +static int base_page_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { unsigned long *pte, next; if (!alloc) return 0; - pte = (unsigned long *) origin; + pte = origin; pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT; do { next = base_page_addr_end(addr, end); @@ -393,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_segment_walk(unsigned long origin, unsigned long addr, +static int base_segment_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *ste, next, table; + unsigned long *ste, next, *table; int rc; - ste = (unsigned long *) origin; + ste = origin; ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; do { next = base_segment_addr_end(addr, end); @@ -409,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, table = base_pgt_alloc(); if (!table) return -ENOMEM; - *ste = table | _SEGMENT_ENTRY; + *ste = __pa(table) | _SEGMENT_ENTRY; } - table = *ste & _SEGMENT_ENTRY_ORIGIN; + table = __va(*ste & _SEGMENT_ENTRY_ORIGIN); rc = base_page_walk(table, addr, next, alloc); if (rc) return rc; @@ -422,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region3_walk(unsigned long origin, unsigned long addr, +static int base_region3_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rtte, next, table; + unsigned long *rtte, next, *table; int rc; - rtte = (unsigned long *) origin; + rtte = origin; rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT; do { next = base_region3_addr_end(addr, end); @@ -438,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rtte = table | _REGION3_ENTRY; + *rtte = __pa(table) | _REGION3_ENTRY; } - table = *rtte & _REGION_ENTRY_ORIGIN; + table = __va(*rtte & _REGION_ENTRY_ORIGIN); rc = base_segment_walk(table, addr, next, alloc); if (rc) return rc; @@ -450,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region2_walk(unsigned long origin, unsigned long addr, +static int base_region2_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rste, next, table; + unsigned long *rste, next, *table; int rc; - rste = (unsigned long *) origin; + rste = origin; rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT; do { next = base_region2_addr_end(addr, end); @@ -466,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, table = 
base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rste = table | _REGION2_ENTRY; + *rste = __pa(table) | _REGION2_ENTRY; } - table = *rste & _REGION_ENTRY_ORIGIN; + table = __va(*rste & _REGION_ENTRY_ORIGIN); rc = base_region3_walk(table, addr, next, alloc); if (rc) return rc; @@ -478,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr, return 0; } -static int base_region1_walk(unsigned long origin, unsigned long addr, +static int base_region1_walk(unsigned long *origin, unsigned long addr, unsigned long end, int alloc) { - unsigned long *rfte, next, table; + unsigned long *rfte, next, *table; int rc; - rfte = (unsigned long *) origin; + rfte = origin; rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT; do { next = base_region1_addr_end(addr, end); @@ -494,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return -ENOMEM; - *rfte = table | _REGION1_ENTRY; + *rfte = __pa(table) | _REGION1_ENTRY; } - table = *rfte & _REGION_ENTRY_ORIGIN; + table = __va(*rfte & _REGION_ENTRY_ORIGIN); rc = base_region2_walk(table, addr, next, alloc); if (rc) return rc; @@ -515,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr, */ void base_asce_free(unsigned long asce) { - unsigned long table = asce & _ASCE_ORIGIN; + unsigned long *table = __va(asce & _ASCE_ORIGIN); if (!asce) return; @@ -567,7 +675,7 @@ static int base_pgt_cache_init(void) */ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) { - unsigned long asce, table, end; + unsigned long asce, *table, end; int rc; if (base_pgt_cache_init()) @@ -578,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages) if (!table) return 0; rc = base_segment_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH; } else if (end <= _REGION2_SIZE) { table = base_crst_alloc(_REGION3_ENTRY_EMPTY); if (!table) return 0; rc = base_region3_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; } else if (end <= _REGION1_SIZE) { table = base_crst_alloc(_REGION2_ENTRY_EMPTY); if (!table) return 0; rc = base_region2_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; } else { table = base_crst_alloc(_REGION1_ENTRY_EMPTY); if (!table) return 0; rc = base_region1_walk(table, addr, end, 1); - asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; + asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH; } if (rc) { base_asce_free(asce); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index c16232cd0ec5..4909dcd762e8 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -115,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pte_val(*ptep) |= _PAGE_INVALID; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID))); mm->context.flush_mm = 1; } else ptep_ipte_global(mm, addr, ptep, nodat); @@ -224,15 +224,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry) * Without enhanced suppression-on-protection force * the dirty bit on for all writable ptes. 
*/ - pte_val(entry) |= _PAGE_DIRTY; - pte_val(entry) &= ~_PAGE_PROTECT; + entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY)); + entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } if (!(pte_val(entry) & _PAGE_PROTECT)) /* This pte allows write access, set user-dirty */ pgste_val(pgste) |= PGSTE_UC_BIT; } #endif - *ptep = entry; + set_pte(ptep, entry); return pgste; } @@ -275,12 +275,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm, pgste = pgste_update_all(old, pgste, mm); if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) - pte_val(old) |= _PAGE_UNUSED; + old = set_pte_bit(old, __pgprot(_PAGE_UNUSED)); } pgste = pgste_set_pte(ptep, pgste, new); pgste_set_unlock(ptep, pgste); } else { - *ptep = new; + set_pte(ptep, new); } return old; } @@ -345,14 +345,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; if (!MACHINE_HAS_NX) - pte_val(pte) &= ~_PAGE_NOEXEC; + pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC)); if (mm_has_pgste(mm)) { pgste = pgste_get(ptep); pgste_set_key(ptep, pgste, pte, mm); pgste = pgste_set_pte(ptep, pgste, pte); pgste_set_unlock(ptep, pgste); } else { - *ptep = pte; + set_pte(ptep, pte); } preempt_enable(); } @@ -417,7 +417,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, atomic_inc(&mm->context.flush_count); if (cpumask_equal(&mm->context.cpu_attach_mask, cpumask_of(smp_processor_id()))) { - pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID))); mm->context.flush_mm = 1; if (mm_has_pgste(mm)) gmap_pmdp_invalidate(mm, addr); @@ -469,7 +469,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_direct(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -482,7 +482,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pmdp_flush_lazy(mm, addr, pmdp); - *pmdp = new; + set_pmd(pmdp, new); preempt_enable(); return old; } @@ -539,7 +539,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr, preempt_disable(); old = pudp_flush_direct(mm, addr, pudp); - *pudp = new; + set_pud(pudp, new); preempt_enable(); return old; } @@ -579,9 +579,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) list_del(lh); } ptep = (pte_t *) pgtable; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); ptep++; - pte_val(*ptep) = _PAGE_INVALID; + set_pte(ptep, __pte(_PAGE_INVALID)); return pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -646,12 +646,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr, if (prot == PROT_NONE && !pte_i) { ptep_flush_direct(mm, addr, ptep, nodat); pgste = pgste_update_all(entry, pgste, mm); - pte_val(entry) |= _PAGE_INVALID; + entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID)); } if (prot == PROT_READ && !pte_p) { ptep_flush_direct(mm, addr, ptep, nodat); - pte_val(entry) &= ~_PAGE_INVALID; - pte_val(entry) |= _PAGE_PROTECT; + entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID)); + entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT)); } pgste_val(pgste) |= bit; pgste = pgste_set_pte(ptep, pgste, entry); @@ -675,8 +675,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr, !(pte_val(pte) & _PAGE_PROTECT))) { pgste_val(spgste) |= PGSTE_VSIE_BIT; tpgste = pgste_get_lock(tptep); - pte_val(tpte) = (pte_val(spte) & PAGE_MASK) | - (pte_val(pte) & _PAGE_PROTECT); + tpte = 
__pte((pte_val(spte) & PAGE_MASK) | + (pte_val(pte) & _PAGE_PROTECT)); /* don't touch the storage key - it belongs to parent pgste */ tpgste = pgste_set_pte(tptep, tpgste, tpte); pgste_set_unlock(tptep, tpgste); @@ -748,7 +748,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep) pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT; ptev = pte_val(*ptep); if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE)) - page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1); + page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0); pgste_set_unlock(ptep, pgste); preempt_enable(); } @@ -773,10 +773,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr, nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT); ptep_ipte_global(mm, addr, ptep, nodat); if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE)) - pte_val(pte) |= _PAGE_PROTECT; + pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT)); else - pte_val(pte) |= _PAGE_INVALID; - *ptep = pte; + pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID)); + set_pte(ptep, pte); } pgste_set_unlock(ptep, pgste); return dirty; diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 7d9705eeb02f..ee1a97078527 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright IBM Corp. 2006 - * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com> */ #include <linux/memory_hotplug.h> @@ -175,9 +174,9 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (!new_page) goto out; - pte_val(*pte) = __pa(new_page) | prot; + set_pte(pte, __pte(__pa(new_page) | prot)); } else { - pte_val(*pte) = __pa(addr) | prot; + set_pte(pte, __pte(__pa(addr) | prot)); } } else { continue; @@ -241,9 +240,9 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, } else if (pmd_none(*pmd)) { if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE) && - MACHINE_HAS_EDAT1 && addr && direct && + MACHINE_HAS_EDAT1 && direct && !debug_pagealloc_enabled()) { - pmd_val(*pmd) = __pa(addr) | prot; + set_pmd(pmd, __pmd(__pa(addr) | prot)); pages++; continue; } else if (!direct && MACHINE_HAS_EDAT1) { @@ -258,7 +257,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, */ new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); if (new_page) { - pmd_val(*pmd) = __pa(new_page) | prot; + set_pmd(pmd, __pmd(__pa(new_page) | prot)); if (!IS_ALIGNED(addr, PMD_SIZE) || !IS_ALIGNED(next, PMD_SIZE)) { vmemmap_use_new_sub_pmd(addr, next); @@ -337,9 +336,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, } else if (pud_none(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE) && - MACHINE_HAS_EDAT2 && addr && direct && + MACHINE_HAS_EDAT2 && direct && !debug_pagealloc_enabled()) { - pud_val(*pud) = __pa(addr) | prot; + set_pud(pud, __pud(__pa(addr) | prot)); pages++; continue; } @@ -562,6 +561,103 @@ int vmem_add_mapping(unsigned long start, unsigned long size) } /* + * Allocate new or return existing page-table entry, but do not map it + * to any physical address. If missing, allocate segment- and region- + * table entries along. Meeting a large segment- or region-table entry + * while traversing is an error, since the function is expected to be + * called against virtual regions reserverd for 4KB mappings only. 
+ */ +pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) +{ + pte_t *ptep = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) { + if (!alloc) + goto out; + p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY); + if (!p4d) + goto out; + pgd_populate(&init_mm, pgd, p4d); + } + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + if (!alloc) + goto out; + pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY); + if (!pud) + goto out; + p4d_populate(&init_mm, p4d, pud); + } + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + if (!alloc) + goto out; + pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY); + if (!pmd) + goto out; + pud_populate(&init_mm, pud, pmd); + } else if (WARN_ON_ONCE(pud_large(*pud))) { + goto out; + } + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + if (!alloc) + goto out; + pte = vmem_pte_alloc(); + if (!pte) + goto out; + pmd_populate(&init_mm, pmd, pte); + } else if (WARN_ON_ONCE(pmd_large(*pmd))) { + goto out; + } + ptep = pte_offset_kernel(pmd, addr); +out: + return ptep; +} + +int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc) +{ + pte_t *ptep, pte; + + if (!IS_ALIGNED(addr, PAGE_SIZE)) + return -EINVAL; + ptep = vmem_get_alloc_pte(addr, alloc); + if (!ptep) + return -ENOMEM; + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte = mk_pte_phys(phys, prot); + set_pte(ptep, pte); + return 0; +} + +int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot) +{ + int rc; + + mutex_lock(&vmem_mutex); + rc = __vmem_map_4k_page(addr, phys, prot, true); + mutex_unlock(&vmem_mutex); + return rc; +} + +void vmem_unmap_4k_page(unsigned long addr) +{ + pte_t *ptep; + + mutex_lock(&vmem_mutex); + ptep = virt_to_kpte(addr); + __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL); + pte_clear(&init_mm, addr, ptep); + mutex_unlock(&vmem_mutex); +} + +/* * map whole physical memory to virtual memory (identity mapping) * we reserve enough space in the vmalloc area for vmemmap to hotplug * additional memory segments. @@ -585,13 +681,12 @@ void __init vmem_map_init(void) __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); - if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) { - /* - * Lowcore must be executable for LPSWE - * and expoline trampoline branch instructions. - */ + /* lowcore requires 4k mapping for real addresses / prefixing */ + set_memory_4k(0, LC_PAGES); + + /* lowcore must be executable for LPSWE */ + if (!static_key_enabled(&cpu_has_bear)) set_memory_x(0, 1); - } pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); |
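A note on the pgalloc.c hunks above: page_table_free() and __tlb_remove_table() track the two 2KB pte-table fragments that share one 4KB page in the upper byte of page->_refcount. The patch spells the fragment masks out as 0x01U/0x02U/0x03U, marks a fragment as pending for delayed release while the list lock is still held, and routes the final consistency check through the new page_table_release_check() helper. The user-space sketch below only illustrates that bit bookkeeping; xor_bits(), free_half() and the exact bit layout are assumptions made for the example, not the kernel's definitions.

#include <stdatomic.h>
#include <stdio.h>

/* mirror of the idea behind the kernel's atomic_xor_bits(): xor and return the new value */
static unsigned int xor_bits(atomic_uint *v, unsigned int bits)
{
	return atomic_fetch_xor(v, bits) ^ bits;
}

/* free one 2KB half: bit is 0 for the lower half, 1 for the upper half */
static int free_half(atomic_uint *state, unsigned int bit)
{
	unsigned int mask;

	/* clear the in-use bit and set the pending bit in one shot */
	mask = xor_bits(state, 0x11U << bit);
	printf("half %u freed: in-use=%#x pending=%#x\n",
	       bit, mask & 0x03U, (mask >> 4) & 0x03U);

	/* drop the pending bit again; a zero result means the whole page is idle */
	mask = xor_bits(state, 0x10U << bit);
	return mask == 0x00U;
}

int main(void)
{
	atomic_uint state = 0;

	xor_bits(&state, 0x03U);	/* both 2KB halves handed out */

	if (!free_half(&state, 0))
		printf("lower half gone, upper half still in use\n");
	if (free_half(&state, 1))
		printf("both halves gone, the 4KB page can be released\n");
	return 0;
}

In the kernel the same intermediate values decide whether the page stays on mm->context.pgtable_list and when __free_page() may finally run after the TLB flush.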
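The base_pgt_alloc()/base_crst_alloc() and base_*_walk() changes switch the software walk behind base_asce_alloc() from passing table origins around as unsigned long values to passing virtual pointers, converting with __pa() only when an entry is written and with __va() when an entry is followed. A stand-alone sketch of that store/load discipline is below; toy_pa()/toy_va(), the offset, the mask and the type bit are all invented for illustration (assuming an LP64 build), they are not the s390 definitions.

#include <stdio.h>

#define ENTRY_ORIGIN_MASK	(~0xfffUL)	/* assumed: low 12 bits carry type/flag bits */
#define SEGMENT_ENTRY		0x20UL		/* assumed type bit */

/* pretend virtual/physical conversion with a fixed, page-aligned offset */
static unsigned long toy_pa(const void *virt)
{
	return (unsigned long)virt - 0x1000UL;
}

static void *toy_va(unsigned long phys)
{
	return (void *)(phys + 0x1000UL);
}

int main(void)
{
	static _Alignas(4096) unsigned long next_level[512];
	unsigned long entry;
	unsigned long *table;

	/* store side: pointer -> physical origin, then or-in the type bits */
	entry = toy_pa(next_level) | SEGMENT_ENTRY;

	/* load side: strip the flag bits, then physical origin -> pointer */
	table = toy_va(entry & ENTRY_ORIGIN_MASK);

	printf("round trip ok: %s\n", table == next_level ? "yes" : "no");
	return 0;
}

Keeping the conversion at the single point where an entry is read or written is what lets the walkers themselves stay oblivious to the distinction, which is exactly how the hunks restructure base_segment_walk() through base_region1_walk().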
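The pgtable.c hunks replace direct writes such as pte_val(*ptep) |= _PAGE_INVALID with the set_pte()/set_pmd()/set_pud() accessors, building the new value on a local copy via set_pte_bit()/clear_pte_bit() and storing it in a single call. The model below re-declares trimmed-down versions of those helpers purely so the conversion pattern compiles stand-alone; the real s390 definitions live elsewhere and are free to do more than a plain store.

#include <stdio.h>

#define _PAGE_INVALID	0x400UL		/* values taken over for illustration only */
#define _PAGE_PROTECT	0x200UL

typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pgprot; } pgprot_t;

#define __pgprot(x)	((pgprot_t) { (x) })
#define pte_val(x)	((x).pte)

static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
{
	return (pte_t) { pte.pte | prot.pgprot };
}

static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
	return (pte_t) { pte.pte & ~prot.pgprot };
}

static inline void set_pte(pte_t *ptep, pte_t pte)
{
	/* one store per update; the kernel helper may add ordering or checking */
	*ptep = pte;
}

int main(void)
{
	pte_t entry = { 0x1000UL };
	pte_t *ptep = &entry;

	/* old style:  pte_val(*ptep) |= _PAGE_INVALID;          */
	/* new style:  build the value first, store it in one go: */
	set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
	set_pte(ptep, clear_pte_bit(*ptep, __pgprot(_PAGE_PROTECT)));

	printf("pte = %#lx\n", pte_val(*ptep));
	return 0;
}

Funnelling every store to a live entry through one helper gives the architecture a single place to change the instruction sequence or add sanity checks later, without touching each call site again.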
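The new vmem_get_alloc_pte(), __vmem_map_4k_page(), vmem_map_4k_page() and vmem_unmap_4k_page() helpers in the vmem.c hunk give the kernel a way to map and unmap a single 4KB page in its own address space, which vmem_map_init() then relies on when it forces the first LC_PAGES of lowcore to 4KB mappings. The sketch below imitates only the walk-or-allocate descent of vmem_get_alloc_pte() with a toy two-level table in user space; the index split, HUGE_FLAG and pte_slot() are invented for the example.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define TABLE_ENTRIES	512
#define HUGE_FLAG	0x1UL		/* assumed marker for a large mapping */

struct toy_pgd {
	unsigned long entry[TABLE_ENTRIES];	/* 0 = empty, otherwise pointer to a pte table */
};

/* return the pte slot for addr, allocating the missing pte table only if alloc is set */
static unsigned long *pte_slot(struct toy_pgd *pgd, unsigned long addr, bool alloc)
{
	unsigned int hi = (addr >> 21) % TABLE_ENTRIES;	/* assumed index split */
	unsigned int lo = (addr >> 12) % TABLE_ENTRIES;
	unsigned long *table;

	if (pgd->entry[hi] & HUGE_FLAG)
		return NULL;				/* large mapping in the way, give up */
	if (!pgd->entry[hi]) {
		if (!alloc)
			return NULL;
		table = calloc(TABLE_ENTRIES, sizeof(*table));
		if (!table)
			return NULL;
		pgd->entry[hi] = (unsigned long)table;
	}
	table = (unsigned long *)pgd->entry[hi];
	return &table[lo];
}

int main(void)
{
	static struct toy_pgd pgd;
	unsigned long addr = 0x123456000UL;

	printf("lookup only: %p\n", (void *)pte_slot(&pgd, addr, false));
	printf("with alloc:  %p\n", (void *)pte_slot(&pgd, addr, true));
	return 0;
}

In the kernel the returned slot is then updated under vmem_mutex: __vmem_map_4k_page() first invalidates the old entry with __ptep_ipte() and then installs the new one with set_pte(), as the hunk above shows.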