diff options
Diffstat (limited to 'arch/x86/mm/fault.c')
-rw-r--r-- | arch/x86/mm/fault.c | 179 |
1 files changed, 101 insertions, 78 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 971977c438fc..622d12ec7f08 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -19,6 +19,7 @@ #include <linux/uaccess.h> /* faulthandler_disabled() */ #include <linux/efi.h> /* efi_crash_gracefully_on_page_fault()*/ #include <linux/mm_types.h> +#include <linux/mm.h> /* find_and_lock_vma() */ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/traps.h> /* dotraplinkage, ... */ @@ -33,6 +34,8 @@ #include <asm/kvm_para.h> /* kvm_handle_async_pf */ #include <asm/vdso.h> /* fixup_vdso_exception() */ #include <asm/irq_stack.h> +#include <asm/fred.h> +#include <asm/sev.h> /* snp_dump_hva_rmpentry() */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -249,7 +252,7 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; - if (pmd_large(*pmd_k)) + if (pmd_leaf(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); @@ -318,7 +321,7 @@ static void dump_pagetable(unsigned long address) * And let's rather not kmap-atomic the pte, just in case * it's allocated already: */ - if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd)) + if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -367,7 +370,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("P4D %lx ", p4d_val(*p4d)); - if (!p4d_present(*p4d) || p4d_large(*p4d)) + if (!p4d_present(*p4d) || p4d_leaf(*p4d)) goto out; pud = pud_offset(p4d, address); @@ -375,7 +378,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud) || pud_large(*pud)) + if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); @@ -383,7 +386,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) goto out; pte = pte_offset_kernel(pmd, address); @@ -546,6 +549,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad !(error_code & X86_PF_PROT) ? "not-present page" : (error_code & X86_PF_RSVD) ? "reserved bit violation" : (error_code & X86_PF_PK) ? "protection keys violation" : + (error_code & X86_PF_RMP) ? "RMP violation" : "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { @@ -578,6 +582,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad } dump_pagetable(address); + + if (error_code & X86_PF_RMP) + snp_dump_hva_rmpentry(address); } static noinline void @@ -769,6 +776,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, unsigned long address, struct task_struct *tsk) { const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG; + /* This is a racy snapshot, but it's better than nothing. */ + int cpu = raw_smp_processor_id(); if (!unhandled_signal(tsk, SIGSEGV)) return; @@ -782,20 +791,19 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code, print_vma_addr(KERN_CONT " in ", regs->ip); + /* + * Dump the likely CPU where the fatal segfault happened. + * This can help identify faulty hardware. + */ + printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu, + topology_core_id(cpu), topology_physical_package_id(cpu)); + + printk(KERN_CONT "\n"); show_opcodes(regs, loglvl); } -/* - * The (legacy) vsyscall page is the long page in the kernel portion - * of the address space that has user-accessible permissions. - */ -static bool is_vsyscall_vaddr(unsigned long vaddr) -{ - return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR); -} - static void __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, unsigned long address, u32 pkey, int si_code) @@ -869,12 +877,6 @@ __bad_area(struct pt_regs *regs, unsigned long error_code, __bad_area_nosemaphore(regs, error_code, address, pkey, si_code); } -static noinline void -bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address) -{ - __bad_area(regs, error_code, address, 0, SEGV_MAPERR); -} - static inline bool bad_area_access_from_pkeys(unsigned long error_code, struct vm_area_struct *vma) { @@ -1034,21 +1036,21 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!p4d_present(*p4d)) return 0; - if (p4d_large(*p4d)) + if (p4d_leaf(*p4d)) return spurious_kernel_fault_check(error_code, (pte_t *) p4d); pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return 0; - if (pmd_large(*pmd)) + if (pmd_leaf(*pmd)) return spurious_kernel_fault_check(error_code, (pte_t *) pmd); pte = pte_offset_kernel(pmd, address); @@ -1107,8 +1109,22 @@ access_error(unsigned long error_code, struct vm_area_struct *vma) (error_code & X86_PF_INSTR), foreign)) return 1; + /* + * Shadow stack accesses (PF_SHSTK=1) are only permitted to + * shadow stack VMAs. All other accesses result in an error. + */ + if (error_code & X86_PF_SHSTK) { + if (unlikely(!(vma->vm_flags & VM_SHADOW_STACK))) + return 1; + if (unlikely(!(vma->vm_flags & VM_WRITE))) + return 1; + return 0; + } + if (error_code & X86_PF_WRITE) { /* write, present and write, not present: */ + if (unlikely(vma->vm_flags & VM_SHADOW_STACK)) + return 1; if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; return 0; @@ -1283,28 +1299,37 @@ void do_user_addr_fault(struct pt_regs *regs, return; } - /* - * It's safe to allow irq's after cr2 has been saved and the - * vmalloc fault has been handled. - * - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet: - */ - if (user_mode(regs)) { - local_irq_enable(); - flags |= FAULT_FLAG_USER; - } else { - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + /* Legacy check - remove this after verifying that it doesn't trigger */ + if (WARN_ON_ONCE(!(regs->flags & X86_EFLAGS_IF))) { + bad_area_nosemaphore(regs, error_code, address); + return; } + local_irq_enable(); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); + /* + * Read-only permissions can not be expressed in shadow stack PTEs. + * Treat all shadow stack accesses as WRITE faults. This ensures + * that the MM will prepare everything (e.g., break COW) such that + * maybe_mkwrite() can create a proper shadow stack PTE. + */ + if (error_code & X86_PF_SHSTK) + flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_WRITE) flags |= FAULT_FLAG_WRITE; if (error_code & X86_PF_INSTR) flags |= FAULT_FLAG_INSTRUCTION; + /* + * We set FAULT_FLAG_USER based on the register state, not + * based on X86_PF_USER. User space accesses that cause + * system page faults are still user accesses. + */ + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + #ifdef CONFIG_X86_64 /* * Faults in the vsyscall page might need emulation. The @@ -1323,51 +1348,43 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif - /* - * Kernel-mode access to the user address space should only occur - * on well-defined single instructions listed in the exception - * tables. But, an erroneous kernel fault occurring outside one of - * those areas which also holds mmap_lock might deadlock attempting - * to validate the fault against the address space. - * - * Only do the expensive exception table search when we might be at - * risk of a deadlock. This happens if we - * 1. Failed to acquire mmap_lock, and - * 2. The access did not originate in userspace. - */ - if (unlikely(!mmap_read_trylock(mm))) { - if (!user_mode(regs) && !search_exception_tables(regs->ip)) { - /* - * Fault from code in kernel from - * which we do not expect faults. - */ - bad_area_nosemaphore(regs, error_code, address); - return; - } -retry: - mmap_read_lock(mm); - } else { - /* - * The above down_read_trylock() might have succeeded in - * which case we'll have missed the might_sleep() from - * down_read(): - */ - might_sleep(); + if (!(flags & FAULT_FLAG_USER)) + goto lock_mmap; + + vma = lock_vma_under_rcu(mm, address); + if (!vma) + goto lock_mmap; + + if (unlikely(access_error(error_code, vma))) { + vma_end_read(vma); + goto lock_mmap; } + fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); - vma = find_vma(mm, address); - if (unlikely(!vma)) { - bad_area(regs, error_code, address); - return; + if (!(fault & VM_FAULT_RETRY)) { + count_vm_vma_lock_event(VMA_LOCK_SUCCESS); + goto done; } - if (likely(vma->vm_start <= address)) - goto good_area; - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { - bad_area(regs, error_code, address); + count_vm_vma_lock_event(VMA_LOCK_RETRY); + if (fault & VM_FAULT_MAJOR) + flags |= FAULT_FLAG_TRIED; + + /* Quick path to respond to signals */ + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + kernelmode_fixup_or_oops(regs, error_code, address, + SIGBUS, BUS_ADRERR, + ARCH_DEFAULT_PKEY); return; } - if (unlikely(expand_stack(vma, address))) { - bad_area(regs, error_code, address); +lock_mmap: + +retry: + vma = lock_mm_and_find_vma(mm, address, regs); + if (unlikely(!vma)) { + bad_area_nosemaphore(regs, error_code, address); return; } @@ -1375,7 +1392,6 @@ retry: * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ -good_area: if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address, vma); return; @@ -1408,6 +1424,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee @@ -1419,6 +1439,7 @@ good_area: } mmap_read_unlock(mm); +done: if (likely(!(fault & VM_FAULT_ERROR))) return; @@ -1495,8 +1516,10 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code, DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) { - unsigned long address = read_cr2(); irqentry_state_t state; + unsigned long address; + + address = cpu_feature_enabled(X86_FEATURE_FRED) ? fred_event_data(regs) : read_cr2(); prefetchw(¤t->mm->mmap_lock); |