Diffstat (limited to 'arch/s390/mm/fault.c')
-rw-r--r-- | arch/s390/mm/fault.c | 329 |
1 file changed, 198 insertions, 131 deletions
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb475c166..9649d9382e0a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,29 +31,30 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/hugetlb.h>
+#include <linux/kfence.h>
+#include <asm/asm-extable.h>
 #include <asm/asm-offsets.h>
 #include <asm/diag.h>
-#include <asm/pgtable.h>
 #include <asm/gmap.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/facility.h>
+#include <asm/uv.h>
 #include "../kernel/entry.h"

 #define __FAIL_ADDR_MASK -4096L
 #define __SUBCODE_MASK 0x0600
 #define __PF_RES_FIELD 0x8000000000000000ULL

-#define VM_FAULT_BADCONTEXT	0x010000
-#define VM_FAULT_BADMAP		0x020000
-#define VM_FAULT_BADACCESS	0x040000
-#define VM_FAULT_SIGNAL		0x080000
-#define VM_FAULT_PFAULT		0x100000
+#define VM_FAULT_BADCONTEXT	((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP		((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS	((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL		((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT		((__force vm_fault_t) 0x100000)

 enum fault_type {
 	KERNEL_FAULT,
 	USER_FAULT,
-	VDSO_FAULT,
 	GMAP_FAULT,
 };

@@ -77,22 +78,16 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
 	trans_exc_code = regs->int_parm_long & 3;
 	if (likely(trans_exc_code == 0)) {
 		/* primary space exception */
-		if (IS_ENABLED(CONFIG_PGSTE) &&
-		    test_pt_regs_flag(regs, PIF_GUEST_FAULT))
-			return GMAP_FAULT;
-		if (current->thread.mm_segment == USER_DS)
+		if (user_mode(regs))
 			return USER_FAULT;
-		return KERNEL_FAULT;
-	}
-	if (trans_exc_code == 2) {
-		/* secondary space exception */
-		if (current->thread.mm_segment & 1) {
-			if (current->thread.mm_segment == USER_DS_SACF)
-				return USER_FAULT;
+		if (!IS_ENABLED(CONFIG_PGSTE))
 			return KERNEL_FAULT;
-		}
-		return VDSO_FAULT;
+		if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+			return GMAP_FAULT;
+		return KERNEL_FAULT;
 	}
+	if (trans_exc_code == 2)
+		return USER_FAULT;
 	if (trans_exc_code == 1) {
 		/* access register mode, not used in the kernel */
 		return USER_FAULT;
@@ -105,7 +100,7 @@ static int bad_address(void *p)
 {
 	unsigned long dummy;

-	return probe_kernel_address((unsigned long *)p, dummy);
+	return get_kernel_nofault(dummy, (unsigned long *)p);
 }

 static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -121,8 +116,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
 		pr_cont("R1:%016lx ", *table);
 		if (*table & _REGION_ENTRY_INVALID)
 			goto out;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		table = __va(*table & _REGION_ENTRY_ORIGIN);
+		fallthrough;
 	case _ASCE_TYPE_REGION2:
 		table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
 		if (bad_address(table))
@@ -130,8 +125,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
 		pr_cont("R2:%016lx ", *table);
 		if (*table & _REGION_ENTRY_INVALID)
 			goto out;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		table = __va(*table & _REGION_ENTRY_ORIGIN);
+		fallthrough;
 	case _ASCE_TYPE_REGION3:
 		table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
 		if (bad_address(table))
@@ -139,8 +134,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
 		pr_cont("R3:%016lx ", *table);
 		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
 			goto out;
-		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		table = __va(*table & _REGION_ENTRY_ORIGIN);
+		fallthrough;
 	case _ASCE_TYPE_SEGMENT:
 		table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 		if (bad_address(table))
@@ -148,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
 		pr_cont("S:%016lx ", *table);
 		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
 			goto out;
-		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+		table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
 	}
 	table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
 	if (bad_address(table))
@@ -188,10 +183,6 @@ static void dump_fault_info(struct pt_regs *regs)
 		asce = S390_lowcore.user_asce;
 		pr_cont("user ");
 		break;
-	case VDSO_FAULT:
-		asce = S390_lowcore.vdso_asce;
-		pr_cont("vdso ");
-		break;
 	case GMAP_FAULT:
 		asce = ((struct gmap *) S390_lowcore.gmap)->asce;
 		pr_cont("gmap ");
@@ -237,29 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
 			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }

-const struct exception_table_entry *s390_search_extables(unsigned long addr)
-{
-	const struct exception_table_entry *fixup;
-
-	fixup = search_extable(__start_dma_ex_table,
-			       __stop_dma_ex_table - __start_dma_ex_table,
-			       addr);
-	if (!fixup)
-		fixup = search_exception_tables(addr);
-	return fixup;
-}
-
 static noinline void do_no_context(struct pt_regs *regs)
 {
-	const struct exception_table_entry *fixup;
-
-	/* Are we prepared to handle this kernel fault? */
-	fixup = s390_search_extables(regs->psw.addr);
-	if (fixup) {
-		regs->psw.addr = extable_fixup(fixup);
+	if (fixup_exception(regs))
 		return;
-	}
-
 	/*
 	 * Oops. The kernel tried to access some bad page. We'll have to
 	 * terminate things with extreme prejudice.
@@ -272,7 +244,6 @@ static noinline void do_no_context(struct pt_regs *regs)
 	       " in virtual user address space\n");
 	dump_fault_info(regs);
 	die(regs, "Oops");
-	do_exit(SIGKILL);
 }

 static noinline void do_low_address(struct pt_regs *regs)
@@ -282,7 +253,6 @@ static noinline void do_low_address(struct pt_regs *regs)
 	if (regs->psw.mask & PSW_MASK_PSTATE) {
 		/* Low-address protection hit in user mode 'cannot happen'. */
 		die (regs, "Low-address protection");
-		do_exit(SIGKILL);
 	}

 	do_no_context(regs);
@@ -298,36 +268,12 @@ static noinline void do_sigbus(struct pt_regs *regs)
 		      (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }

-static noinline int signal_return(struct pt_regs *regs)
-{
-	u16 instruction;
-	int rc;
-
-	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
-	if (rc)
-		return rc;
-	if (instruction == 0x0a77) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x00040077;
-		return 0;
-	} else if (instruction == 0x0aad) {
-		set_pt_regs_flag(regs, PIF_SYSCALL);
-		regs->int_code = 0x000400ad;
-		return 0;
-	}
-	return -EACCES;
-}
-
-static noinline void do_fault_error(struct pt_regs *regs, int access,
-				    vm_fault_t fault)
+static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
 {
 	int si_code;

 	switch (fault) {
 	case VM_FAULT_BADACCESS:
-		if (access == VM_EXEC && signal_return(regs) == 0)
-			break;
-		/* fallthrough */
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (user_mode(regs)) {
@@ -337,9 +283,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
 			do_sigsegv(regs, si_code);
 			break;
 		}
-		/* fallthrough */
+		fallthrough;
 	case VM_FAULT_BADCONTEXT:
-		/* fallthrough */
 	case VM_FAULT_PFAULT:
 		do_no_context(regs);
 		break;
@@ -377,7 +322,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
  * routines.
  *
  * interruption code (int_code):
- *   04       Protection           ->  Write-Protection  (suprression)
+ *   04       Protection           ->  Write-Protection  (suppression)
  *   10       Segment translation  ->  Not present       (nullification)
  *   11       Page translation     ->  Not present       (nullification)
  *   3b       Region third trans.  ->  Not present       (nullification)
@@ -393,19 +338,22 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 	unsigned long address;
 	unsigned int flags;
 	vm_fault_t fault;
+	bool is_write;

 	tsk = current;
 	/*
 	 * The instruction that caused the program check has
 	 * been nullified. Don't signal single step via SIGTRAP.
 	 */
-	clear_pt_regs_flag(regs, PIF_PER_TRAP);
+	clear_thread_flag(TIF_PER_TRAP);

 	if (kprobe_page_fault(regs, 14))
 		return 0;

 	mm = tsk->mm;
 	trans_exc_code = regs->int_parm_long;
+	address = trans_exc_code & __FAIL_ADDR_MASK;
+	is_write = (trans_exc_code & store_indication) == 0x400;

 	/*
 	 * Verify that the fault happened in user space, that
@@ -416,9 +364,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 	type = get_fault_type(regs);
 	switch (type) {
 	case KERNEL_FAULT:
-		goto out;
-	case VDSO_FAULT:
-		fault = VM_FAULT_BADMAP;
+		if (kfence_handle_page_fault(address, is_write, regs))
+			return 0;
 		goto out;
 	case USER_FAULT:
 	case GMAP_FAULT:
@@ -427,14 +374,15 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 		break;
 	}

-	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	flags = FAULT_FLAG_DEFAULT;
 	if (user_mode(regs))
 		flags |= FAULT_FLAG_USER;
-	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+	if (is_write)
+		access = VM_WRITE;
+	if (access == VM_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);

 	gmap = NULL;
 	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -472,57 +420,49 @@ retry:
 	if (unlikely(!(vma->vm_flags & access)))
 		goto out_up;

-	if (is_vm_hugetlb_page(vma))
-		address &= HPAGE_MASK;
 	/*
 	 * If for any reason at all we couldn't handle the fault,
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(vma, address, flags);
-	/* No reason to continue if interrupted by SIGKILL. */
-	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+	fault = handle_mm_fault(vma, address, flags, regs);
+	if (fault_signal_pending(fault, regs)) {
 		fault = VM_FAULT_SIGNAL;
 		if (flags & FAULT_FLAG_RETRY_NOWAIT)
 			goto out_up;
 		goto out;
 	}
+
+	/* The fault is fully completed (including releasing mmap lock) */
+	if (fault & VM_FAULT_COMPLETED) {
+		if (gmap) {
+			mmap_read_lock(mm);
+			goto out_gmap;
+		}
+		fault = 0;
+		goto out;
+	}
+
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;

-	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
-	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
-			    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
-				/* FAULT_FLAG_RETRY_NOWAIT has been set,
-				 * mmap_sem has not been released */
-				current->thread.gmap_pfault = 1;
-				fault = VM_FAULT_PFAULT;
-				goto out_up;
-			}
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~(FAULT_FLAG_ALLOW_RETRY |
-				   FAULT_FLAG_RETRY_NOWAIT);
-			flags |= FAULT_FLAG_TRIED;
-			down_read(&mm->mmap_sem);
-			goto retry;
+	if (fault & VM_FAULT_RETRY) {
+		if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+		    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+			/*
+			 * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+			 * not been released
+			 */
+			current->thread.gmap_pfault = 1;
+			fault = VM_FAULT_PFAULT;
+			goto out_up;
 		}
+		flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+		flags |= FAULT_FLAG_TRIED;
+		mmap_read_lock(mm);
+		goto retry;
 	}
+out_gmap:
 	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
 		address = __gmap_link(gmap, current->thread.gmap_addr,
 				      address);
@@ -537,7 +477,7 @@ retry:
 	}
 	fault = 0;
 out_up:
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 out:
 	return fault;
 }
@@ -575,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs)
 		fault = do_exception(regs, access);
 	}
 	if (unlikely(fault))
-		do_fault_error(regs, access, fault);
+		do_fault_error(regs, fault);
 }
 NOKPROBE_SYMBOL(do_protection_exception);

@@ -584,10 +524,10 @@ void do_dat_exception(struct pt_regs *regs)
 {
 	int access;
 	vm_fault_t fault;

-	access = VM_READ | VM_EXEC | VM_WRITE;
+	access = VM_ACCESS_FLAGS;
 	fault = do_exception(regs, access);
 	if (unlikely(fault))
-		do_fault_error(regs, access, fault);
+		do_fault_error(regs, fault);
 }
 NOKPROBE_SYMBOL(do_dat_exception);

@@ -737,7 +677,7 @@ static void pfault_interrupt(struct ext_code ext_code,
 			 * interrupt since it must be a leftover of a PFAULT
 			 * CANCEL operation which didn't remove all pending
 			 * completion interrupts. */
-			if (tsk->state == TASK_RUNNING)
+			if (task_is_running(tsk))
 				tsk->thread.pfault_wait = -1;
 		}
 	} else {
@@ -816,3 +756,130 @@ out_extint:
 early_initcall(pfault_irq_init);

 #endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+
+void do_secure_storage_access(struct pt_regs *regs)
+{
+	unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct page *page;
+	struct gmap *gmap;
+	int rc;
+
+	/*
+	 * bit 61 tells us if the address is valid, if it's not we
+	 * have a major problem and should stop the kernel or send a
+	 * SIGSEGV to the process. Unfortunately bit 61 is not
+	 * reliable without the misc UV feature so we need to check
+	 * for that as well.
+	 */
+	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+	    !test_bit_inv(61, &regs->int_parm_long)) {
+		/*
+		 * When this happens, userspace did something that it
+		 * was not supposed to do, e.g. branching into secure
+		 * memory. Trigger a segmentation fault.
+		 */
+		if (user_mode(regs)) {
+			send_sig(SIGSEGV, current, 0);
+			return;
+		}
+
+		/*
+		 * The kernel should never run into this case and we
+		 * have no way out of this situation.
+		 */
+		panic("Unexpected PGM 0x3d with TEID bit 61=0");
+	}
+
+	switch (get_fault_type(regs)) {
+	case GMAP_FAULT:
+		mm = current->mm;
+		gmap = (struct gmap *)S390_lowcore.gmap;
+		mmap_read_lock(mm);
+		addr = __gmap_translate(gmap, addr);
+		mmap_read_unlock(mm);
+		if (IS_ERR_VALUE(addr)) {
+			do_fault_error(regs, VM_FAULT_BADMAP);
+			break;
+		}
+		fallthrough;
+	case USER_FAULT:
+		mm = current->mm;
+		mmap_read_lock(mm);
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			mmap_read_unlock(mm);
+			do_fault_error(regs, VM_FAULT_BADMAP);
+			break;
+		}
+		page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			mmap_read_unlock(mm);
+			break;
+		}
+		if (arch_make_page_accessible(page))
+			send_sig(SIGSEGV, current, 0);
+		put_page(page);
+		mmap_read_unlock(mm);
+		break;
+	case KERNEL_FAULT:
+		page = phys_to_page(addr);
+		if (unlikely(!try_get_page(page)))
+			break;
+		rc = arch_make_page_accessible(page);
+		put_page(page);
+		if (rc)
+			BUG();
+		break;
+	default:
+		do_fault_error(regs, VM_FAULT_BADMAP);
+		WARN_ON_ONCE(1);
+	}
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+	if (get_fault_type(regs) != GMAP_FAULT) {
+		do_fault_error(regs, VM_FAULT_BADMAP);
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+		send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+	/*
+	 * If the VM has been rebooted, its address space might still contain
+	 * secure pages from the previous boot.
+	 * Clear the page so it can be reused.
+	 */
+	if (!gmap_destroy_page(gmap, gaddr))
+		return;
+	/*
+	 * Either KVM messed up the secure guest mapping or the same
+	 * page is mapped into multiple secure guests.
+	 *
+	 * This exception is only triggered when a guest 2 is running
+	 * and can therefore never occur in kernel context.
	 */
+	printk_ratelimited(KERN_WARNING
+			   "Secure storage violation in task: %s, pid %d\n",
+			   current->comm, current->pid);
+	send_sig(SIGSEGV, current, 0);
+}
+
+#endif /* CONFIG_PGSTE */
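
For orientation, the core of this diff is that do_exception() is converted to the generic mmap_lock / handle_mm_fault() retry pattern (handle_mm_fault() now takes regs, may complete the fault and drop the lock itself via VM_FAULT_COMPLETED, or request a retry via VM_FAULT_RETRY). The following is a minimal sketch of that pattern only, not part of the patch: fault_one_address() is a made-up helper name, it assumes a user-space fault, and the s390-specific gmap, pfault and kfence handling from the real do_exception() is omitted.

#include <linux/mm.h>
#include <linux/mmap_lock.h>
#include <linux/sched/signal.h>

static vm_fault_t fault_one_address(struct mm_struct *mm, unsigned long addr,
				    bool is_write, struct pt_regs *regs)
{
	unsigned int flags = FAULT_FLAG_DEFAULT | FAULT_FLAG_USER;
	struct vm_area_struct *vma;
	vm_fault_t fault;

	if (is_write)
		flags |= FAULT_FLAG_WRITE;

	mmap_read_lock(mm);
retry:
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr) {
		mmap_read_unlock(mm);
		return VM_FAULT_SIGSEGV;
	}

	fault = handle_mm_fault(vma, addr, flags, regs);

	/* A fatal signal ended the fault; VM_FAULT_RETRY means the lock is already gone. */
	if (fault_signal_pending(fault, regs))
		return fault;

	/* handle_mm_fault() completed the fault and released the mmap lock itself. */
	if (fault & VM_FAULT_COMPLETED)
		return 0;

	if (fault & VM_FAULT_RETRY) {
		/* The lock was dropped for the retry; take it again and try once more. */
		flags |= FAULT_FLAG_TRIED;
		mmap_read_lock(mm);
		goto retry;
	}

	mmap_read_unlock(mm);
	return fault & VM_FAULT_ERROR ? fault : 0;
}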