aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390/mm/fault.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm/fault.c')
-rw-r--r--arch/s390/mm/fault.c329
1 files changed, 198 insertions, 131 deletions
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 7b0bb475c166..9649d9382e0a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -31,29 +31,30 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
+#include <linux/kfence.h>
+#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
+#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000)
enum fault_type {
KERNEL_FAULT,
USER_FAULT,
- VDSO_FAULT,
GMAP_FAULT,
};
@@ -77,22 +78,16 @@ static enum fault_type get_fault_type(struct pt_regs *regs)
trans_exc_code = regs->int_parm_long & 3;
if (likely(trans_exc_code == 0)) {
/* primary space exception */
- if (IS_ENABLED(CONFIG_PGSTE) &&
- test_pt_regs_flag(regs, PIF_GUEST_FAULT))
- return GMAP_FAULT;
- if (current->thread.mm_segment == USER_DS)
+ if (user_mode(regs))
return USER_FAULT;
- return KERNEL_FAULT;
- }
- if (trans_exc_code == 2) {
- /* secondary space exception */
- if (current->thread.mm_segment & 1) {
- if (current->thread.mm_segment == USER_DS_SACF)
- return USER_FAULT;
+ if (!IS_ENABLED(CONFIG_PGSTE))
return KERNEL_FAULT;
- }
- return VDSO_FAULT;
+ if (test_pt_regs_flag(regs, PIF_GUEST_FAULT))
+ return GMAP_FAULT;
+ return KERNEL_FAULT;
}
+ if (trans_exc_code == 2)
+ return USER_FAULT;
if (trans_exc_code == 1) {
/* access register mode, not used in the kernel */
return USER_FAULT;
@@ -105,7 +100,7 @@ static int bad_address(void *p)
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -121,8 +116,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R1:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
if (bad_address(table))
@@ -130,8 +125,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R2:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
if (bad_address(table))
@@ -139,8 +134,8 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R3:%016lx ", *table);
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (bad_address(table))
@@ -148,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("S:%016lx ", *table);
if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
if (bad_address(table))
@@ -188,10 +183,6 @@ static void dump_fault_info(struct pt_regs *regs)
asce = S390_lowcore.user_asce;
pr_cont("user ");
break;
- case VDSO_FAULT:
- asce = S390_lowcore.vdso_asce;
- pr_cont("vdso ");
- break;
case GMAP_FAULT:
asce = ((struct gmap *) S390_lowcore.gmap)->asce;
pr_cont("gmap ");
@@ -237,29 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-const struct exception_table_entry *s390_search_extables(unsigned long addr)
-{
- const struct exception_table_entry *fixup;
-
- fixup = search_extable(__start_dma_ex_table,
- __stop_dma_ex_table - __start_dma_ex_table,
- addr);
- if (!fixup)
- fixup = search_exception_tables(addr);
- return fixup;
-}
-
static noinline void do_no_context(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
-
- /* Are we prepared to handle this kernel fault? */
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ if (fixup_exception(regs))
return;
- }
-
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
@@ -272,7 +244,6 @@ static noinline void do_no_context(struct pt_regs *regs)
" in virtual user address space\n");
dump_fault_info(regs);
die(regs, "Oops");
- do_exit(SIGKILL);
}
static noinline void do_low_address(struct pt_regs *regs)
@@ -282,7 +253,6 @@ static noinline void do_low_address(struct pt_regs *regs)
if (regs->psw.mask & PSW_MASK_PSTATE) {
/* Low-address protection hit in user mode 'cannot happen'. */
die (regs, "Low-address protection");
- do_exit(SIGKILL);
}
do_no_context(regs);
@@ -298,36 +268,12 @@ static noinline void do_sigbus(struct pt_regs *regs)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-static noinline int signal_return(struct pt_regs *regs)
-{
- u16 instruction;
- int rc;
-
- rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
- if (rc)
- return rc;
- if (instruction == 0x0a77) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x00040077;
- return 0;
- } else if (instruction == 0x0aad) {
- set_pt_regs_flag(regs, PIF_SYSCALL);
- regs->int_code = 0x000400ad;
- return 0;
- }
- return -EACCES;
-}
-
-static noinline void do_fault_error(struct pt_regs *regs, int access,
- vm_fault_t fault)
+static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
int si_code;
switch (fault) {
case VM_FAULT_BADACCESS:
- if (access == VM_EXEC && signal_return(regs) == 0)
- break;
- /* fallthrough */
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
@@ -337,9 +283,8 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
do_sigsegv(regs, si_code);
break;
}
- /* fallthrough */
+ fallthrough;
case VM_FAULT_BADCONTEXT:
- /* fallthrough */
case VM_FAULT_PFAULT:
do_no_context(regs);
break;
@@ -377,7 +322,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
+ * 04 Protection -> Write-Protection (suppression)
* 10 Segment translation -> Not present (nullification)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
@@ -393,19 +338,22 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
unsigned long address;
unsigned int flags;
vm_fault_t fault;
+ bool is_write;
tsk = current;
/*
* The instruction that caused the program check has
* been nullified. Don't signal single step via SIGTRAP.
*/
- clear_pt_regs_flag(regs, PIF_PER_TRAP);
+ clear_thread_flag(TIF_PER_TRAP);
if (kprobe_page_fault(regs, 14))
return 0;
mm = tsk->mm;
trans_exc_code = regs->int_parm_long;
+ address = trans_exc_code & __FAIL_ADDR_MASK;
+ is_write = (trans_exc_code & store_indication) == 0x400;
/*
* Verify that the fault happened in user space, that
@@ -416,9 +364,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
type = get_fault_type(regs);
switch (type) {
case KERNEL_FAULT:
- goto out;
- case VDSO_FAULT:
- fault = VM_FAULT_BADMAP;
+ if (kfence_handle_page_fault(address, is_write, regs))
+ return 0;
goto out;
case USER_FAULT:
case GMAP_FAULT:
@@ -427,14 +374,15 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
break;
}
- address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -472,57 +420,49 @@ retry:
if (unlikely(!(vma->vm_flags & access)))
goto out_up;
- if (is_vm_hugetlb_page(vma))
- address &= HPAGE_MASK;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto out_gmap;
+ }
+ fault = 0;
+ goto out;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
- flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
- goto retry;
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+ * not been released
+ */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
@@ -537,7 +477,7 @@ retry:
}
fault = 0;
out_up:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return fault;
}
@@ -575,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs)
fault = do_exception(regs, access);
}
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);
@@ -584,10 +524,10 @@ void do_dat_exception(struct pt_regs *regs)
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);
@@ -737,7 +677,7 @@ static void pfault_interrupt(struct ext_code ext_code,
* interrupt since it must be a leftover of a PFAULT
* CANCEL operation which didn't remove all pending
* completion interrupts. */
- if (tsk->state == TASK_RUNNING)
+ if (task_is_running(tsk))
tsk->thread.pfault_wait = -1;
}
} else {
@@ -816,3 +756,130 @@ out_extint:
early_initcall(pfault_irq_init);
#endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+
+/*
+ * Handle a secure storage access exception (judging by the panic message
+ * below, this is program interruption code 0x3d).
+ *
+ * The failing address is taken from regs->int_parm_long with the low bits
+ * masked off by __FAIL_ADDR_MASK. Depending on the fault type reported by
+ * get_fault_type(), the page backing that address is looked up and passed
+ * to arch_make_page_accessible().
+ *
+ * @regs: register state saved at the time of the program check
+ */
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ struct gmap *gmap;
+ int rc;
+
+ /*
+ * Bit 61 tells us if the address is valid. If it is not, we
+ * have a major problem and should stop the kernel or send a
+ * SIGSEGV to the process. Unfortunately bit 61 is not
+ * reliable without the misc UV feature so we need to check
+ * for that as well.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+ !test_bit_inv(61, &regs->int_parm_long)) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
+ }
+
+ /*
+ * The kernel should never run into this case and we
+ * have no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+
+ switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ /*
+ * Replace addr with the result of __gmap_translate() for the
+ * gmap found in the lowcore (presumably guest address to host
+ * user address - confirm against __gmap_translate()). The
+ * translation is done under the mmap read lock.
+ */
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr)) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ /* Continue with the translated address as for a user fault. */
+ fallthrough;
+ case USER_FAULT:
+ /* The mmap read lock is held until the page has been handled. */
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ /* FOLL_GET: a page reference is taken and dropped via put_page(). */
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page)) {
+ /*
+ * No page was returned: leave without raising an error.
+ * NOTE(review): presumably the access is simply retried
+ * and handled by the regular fault path - confirm.
+ */
+ mmap_read_unlock(mm);
+ break;
+ }
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ /* Here addr is passed to phys_to_page(), i.e. used as a physical address. */
+ page = phys_to_page(addr);
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ /* Failure to make a page accessible for the kernel is fatal. */
+ if (rc)
+ BUG();
+ break;
+ default:
+ /* Any other fault type is unexpected: report it and warn once. */
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+/*
+ * Handle a non-secure storage access exception.
+ *
+ * Only a GMAP_FAULT is expected here. Any other fault type is reported via
+ * do_fault_error(VM_FAULT_BADMAP) and triggers a one-time warning. For a
+ * gmap fault the failing address (low bits masked off by __FAIL_ADDR_MASK)
+ * is passed to gmap_convert_to_secure() for the gmap found in the lowcore.
+ *
+ * @regs: register state saved at the time of the program check
+ */
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ if (get_fault_type(regs) != GMAP_FAULT) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ /*
+ * Only -EINVAL results in a SIGSEGV; every other return value of
+ * gmap_convert_to_secure() is ignored here.
+ * NOTE(review): presumably other errors are transient and the access
+ * is retried - confirm against gmap_convert_to_secure().
+ */
+ if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+/*
+ * Handle a secure storage violation exception.
+ *
+ * First tries gmap_destroy_page() on the failing address (low bits masked
+ * off by __FAIL_ADDR_MASK) for the gmap found in the lowcore. If that
+ * returns 0 nothing else is done; otherwise a rate-limited warning is
+ * printed and SIGSEGV is sent to the current task.
+ *
+ * NOTE(review): unlike do_secure_storage_access() and
+ * do_non_secure_storage_access(), no NOKPROBE_SYMBOL() annotation follows
+ * this function in the visible patch - confirm whether that is intended.
+ *
+ * @regs: register state saved at the time of the program check
+ */
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ /*
+ * If the VM has been rebooted, its address space might still contain
+ * secure pages from the previous boot.
+ * Clear the page so it can be reused.
+ */
+ if (!gmap_destroy_page(gmap, gaddr))
+ return;
+ /*
+ * Either KVM messed up the secure guest mapping or the same
+ * page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is running
+ * and can therefore never occur in kernel context.
+ */
+ printk_ratelimited(KERN_WARNING
+ "Secure storage violation in task: %s, pid %d\n",
+ current->comm, current->pid);
+ send_sig(SIGSEGV, current, 0);
+}
+
+#endif /* CONFIG_PGSTE */