aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/debug_pagetables.c58
-rw-r--r--arch/x86/mm/dump_pagetables.c26
-rw-r--r--arch/x86/mm/fault.c244
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c33
-rw-r--r--arch/x86/mm/kasan_init_64.c55
-rw-r--r--arch/x86/mm/mem_encrypt.c7
-rw-r--r--arch/x86/mm/mm_internal.h2
-rw-r--r--arch/x86/mm/mpx.c2
-rw-r--r--arch/x86/mm/pageattr-test.c31
-rw-r--r--arch/x86/mm/pageattr.c309
-rw-r--r--arch/x86/mm/pat.c13
-rw-r--r--arch/x86/mm/pgtable.c18
-rw-r--r--arch/x86/mm/pkeys.c1
-rw-r--r--arch/x86/mm/tlb.c4
16 files changed, 418 insertions, 391 deletions
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 225fe2f0bfec..cd84f067e41d 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -10,20 +10,9 @@ static int ptdump_show(struct seq_file *m, void *v)
return 0;
}
-static int ptdump_open(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show, NULL);
-}
-
-static const struct file_operations ptdump_fops = {
- .owner = THIS_MODULE,
- .open = ptdump_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump);
-static int ptdump_show_curknl(struct seq_file *m, void *v)
+static int ptdump_curknl_show(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
@@ -33,23 +22,12 @@ static int ptdump_show_curknl(struct seq_file *m, void *v)
return 0;
}
-static int ptdump_open_curknl(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show_curknl, NULL);
-}
-
-static const struct file_operations ptdump_curknl_fops = {
- .owner = THIS_MODULE,
- .open = ptdump_open_curknl,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump_curknl);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static struct dentry *pe_curusr;
-static int ptdump_show_curusr(struct seq_file *m, void *v)
+static int ptdump_curusr_show(struct seq_file *m, void *v)
{
if (current->mm->pgd) {
down_read(&current->mm->mmap_sem);
@@ -59,42 +37,20 @@ static int ptdump_show_curusr(struct seq_file *m, void *v)
return 0;
}
-static int ptdump_open_curusr(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show_curusr, NULL);
-}
-
-static const struct file_operations ptdump_curusr_fops = {
- .owner = THIS_MODULE,
- .open = ptdump_open_curusr,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump_curusr);
#endif
#if defined(CONFIG_EFI) && defined(CONFIG_X86_64)
static struct dentry *pe_efi;
-static int ptdump_show_efi(struct seq_file *m, void *v)
+static int ptdump_efi_show(struct seq_file *m, void *v)
{
if (efi_mm.pgd)
ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
return 0;
}
-static int ptdump_open_efi(struct inode *inode, struct file *filp)
-{
- return single_open(filp, ptdump_show_efi, NULL);
-}
-
-static const struct file_operations ptdump_efi_fops = {
- .owner = THIS_MODULE,
- .open = ptdump_open_efi,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(ptdump_efi);
#endif
static struct dentry *dir, *pe_knl, *pe_curknl;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index fc37bbd23eb8..e3cdc85ce5b6 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -55,10 +55,10 @@ struct addr_marker {
enum address_markers_idx {
USER_SPACE_NR = 0,
KERNEL_SPACE_NR,
- LOW_KERNEL_NR,
-#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
LDT_NR,
#endif
+ LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
@@ -66,9 +66,6 @@ enum address_markers_idx {
KASAN_SHADOW_END_NR,
#endif
CPU_ENTRY_AREA_NR,
-#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
- LDT_NR,
-#endif
#ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
#endif
@@ -380,7 +377,7 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
/*
* This is an optimization for KASAN=y case. Since all kasan page tables
- * eventually point to the kasan_zero_page we could call note_page()
+ * eventually point to the kasan_early_shadow_page we could call note_page()
* right away without walking through lower level page tables. This saves
* us dozens of seconds (minutes for 5-level config) while checking for
* W+X mapping or reading kernel_page_tables debugfs file.
@@ -388,10 +385,11 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
void *pt)
{
- if (__pa(pt) == __pa(kasan_zero_pmd) ||
- (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
- __pa(pt) == __pa(kasan_zero_pud)) {
- pgprotval_t prot = pte_flags(kasan_zero_pte[0]);
+ if (__pa(pt) == __pa(kasan_early_shadow_pmd) ||
+ (pgtable_l5_enabled() &&
+ __pa(pt) == __pa(kasan_early_shadow_p4d)) ||
+ __pa(pt) == __pa(kasan_early_shadow_pud)) {
+ pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]);
note_page(m, st, __pgprot(prot), 0, 5);
return true;
}
@@ -512,11 +510,11 @@ static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
/*
- * ffff800000000000 - ffff87ffffffffff is reserved for
- * the hypervisor.
+ * A hole in the beginning of kernel address space reserved
+ * for a hypervisor.
*/
- return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
- (idx < pgd_index(__PAGE_OFFSET));
+ return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
+ (idx < pgd_index(GUARD_HOLE_END_ADDR));
#else
return false;
#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 71d4b9d4d43f..2ff25ad33233 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
#include <asm/vm86.h> /* struct vm86 */
#include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_recover_from_page_fault()*/
+#include <asm/desc.h> /* store_idt(), ... */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -571,10 +572,55 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
return 0;
}
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+ u32 offset = (index >> 3) * sizeof(struct desc_struct);
+ unsigned long addr;
+ struct ldttss_desc desc;
+
+ if (index == 0) {
+ pr_alert("%s: NULL\n", name);
+ return;
+ }
+
+ if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+ pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+ return;
+ }
+
+ if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
+ sizeof(struct ldttss_desc))) {
+ pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+ name, index);
+ return;
+ }
+
+ addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+#ifdef CONFIG_X86_64
+ addr |= ((u64)desc.base3 << 32);
+#endif
+ pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+ name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
+
+/*
+ * This helper function transforms the #PF error_code bits into
+ * "[PROT] [USER]" type of descriptive, almost human-readable error strings:
+ */
+static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt)
+{
+ if (error_code & mask) {
+ if (buf[0])
+ strcat(buf, " ");
+ strcat(buf, txt);
+ }
+}
+
static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
+ char err_txt[64];
+
if (!oops_may_print())
return;
@@ -602,6 +648,52 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
(void *)address);
+ err_txt[0] = 0;
+
+ /*
+ * Note: length of these appended strings including the separation space and the
+ * zero delimiter must fit into err_txt[].
+ */
+ err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" );
+ err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]");
+ err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" );
+ err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" );
+ err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]");
+ err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" );
+
+ pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]");
+
+ if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+ struct desc_ptr idt, gdt;
+ u16 ldtr, tr;
+
+ pr_alert("This was a system access from user code\n");
+
+ /*
+ * This can happen for quite a few reasons. The more obvious
+ * ones are faults accessing the GDT, or LDT. Perhaps
+ * surprisingly, if the CPU tries to deliver a benign or
+ * contributory exception from user code and gets a page fault
+ * during delivery, the page fault can be delivered as though
+ * it originated directly from user code. This could happen
+ * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+ * kernel or IST stack.
+ */
+ store_idt(&idt);
+
+ /* Usable even on Xen PV -- it's just slow. */
+ native_store_gdt(&gdt);
+
+ pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+ idt.address, idt.size, gdt.address, gdt.size);
+
+ store_ldt(ldtr);
+ show_ldttss(&gdt, "LDTR", ldtr);
+
+ store_tr(tr);
+ show_ldttss(&gdt, "TR", tr);
+ }
+
dump_pagetable(address);
}
@@ -621,16 +713,30 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
tsk->comm, address);
dump_pagetable(address);
- tsk->thread.cr2 = address;
- tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code;
-
if (__die("Bad pagetable", regs, error_code))
sig = 0;
oops_end(flags, regs, sig);
}
+static void set_signal_archinfo(unsigned long address,
+ unsigned long error_code)
+{
+ struct task_struct *tsk = current;
+
+ /*
+ * To avoid leaking information about the kernel page
+ * table layout, pretend that user-mode accesses to
+ * kernel addresses are always protection faults.
+ */
+ if (address >= TASK_SIZE_MAX)
+ error_code |= X86_PF_PROT;
+
+ tsk->thread.trap_nr = X86_TRAP_PF;
+ tsk->thread.error_code = error_code | X86_PF_USER;
+ tsk->thread.cr2 = address;
+}
+
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code)
@@ -639,6 +745,15 @@ no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long flags;
int sig;
+ if (user_mode(regs)) {
+ /*
+ * This is an implicit supervisor-mode access from user
+ * mode. Bypass all the kernel-mode recovery code and just
+ * OOPS.
+ */
+ goto oops;
+ }
+
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
/*
@@ -656,9 +771,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
* faulting through the emulate_vsyscall() logic.
*/
if (current->thread.sig_on_uaccess_err && signal) {
- tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code | X86_PF_USER;
- tsk->thread.cr2 = address;
+ set_signal_archinfo(address, error_code);
/* XXX: hwpoison faults will set the wrong code. */
force_sig_fault(signal, si_code, (void __user *)address,
@@ -726,6 +839,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (IS_ENABLED(CONFIG_EFI))
efi_recover_from_page_fault(address);
+oops:
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice:
@@ -737,10 +851,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (task_stack_end_corrupted(tsk))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
- tsk->thread.cr2 = address;
- tsk->thread.trap_nr = X86_TRAP_PF;
- tsk->thread.error_code = error_code;
-
sig = SIGKILL;
if (__die("Oops", regs, error_code))
sig = 0;
@@ -794,7 +904,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
/* User mode accesses just cause a SIGSEGV */
- if (error_code & X86_PF_USER) {
+ if (user_mode(regs) && (error_code & X86_PF_USER)) {
/*
* It's possible to have interrupts off here:
*/
@@ -821,9 +931,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (likely(show_unhandled_signals))
show_signal_msg(regs, error_code, address, tsk);
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_PF;
+ set_signal_archinfo(address, error_code);
if (si_code == SEGV_PKUERR)
force_sig_pkuerr((void __user *)address, pkey);
@@ -937,9 +1045,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
if (is_prefetch(regs, error_code, address))
return;
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_nr = X86_TRAP_PF;
+ set_signal_archinfo(address, error_code);
#ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
@@ -1148,23 +1254,6 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
}
-static inline bool smap_violation(int error_code, struct pt_regs *regs)
-{
- if (!IS_ENABLED(CONFIG_X86_SMAP))
- return false;
-
- if (!static_cpu_has(X86_FEATURE_SMAP))
- return false;
-
- if (error_code & X86_PF_USER)
- return false;
-
- if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
- return false;
-
- return true;
-}
-
/*
* Called for all faults where 'address' is part of the kernel address
* space. Might get called for faults that originate from *code* that
@@ -1230,7 +1319,6 @@ void do_user_addr_fault(struct pt_regs *regs,
unsigned long hw_error_code,
unsigned long address)
{
- unsigned long sw_error_code;
struct vm_area_struct *vma;
struct task_struct *tsk;
struct mm_struct *mm;
@@ -1252,10 +1340,16 @@ void do_user_addr_fault(struct pt_regs *regs,
pgtable_bad(regs, hw_error_code, address);
/*
- * Check for invalid kernel (supervisor) access to user
- * pages in the user address space.
+ * If SMAP is on, check for invalid kernel (supervisor) access to user
+ * pages in the user address space. The odd case here is WRUSS,
+ * which, according to the preliminary documentation, does not respect
+ * SMAP and will have the USER bit set so, in all cases, SMAP
+ * enforcement appears to be consistent with the USER bit.
*/
- if (unlikely(smap_violation(hw_error_code, regs))) {
+ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
+ !(hw_error_code & X86_PF_USER) &&
+ !(regs->flags & X86_EFLAGS_AC)))
+ {
bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
@@ -1270,13 +1364,6 @@ void do_user_addr_fault(struct pt_regs *regs,
}
/*
- * hw_error_code is literally the "page fault error code" passed to
- * the kernel directly from the hardware. But, we will shortly be
- * modifying it in software, so give it a new name.
- */
- sw_error_code = hw_error_code;
-
- /*
* It's safe to allow irq's after cr2 has been saved and the
* vmalloc fault has been handled.
*
@@ -1285,26 +1372,6 @@ void do_user_addr_fault(struct pt_regs *regs,
*/
if (user_mode(regs)) {
local_irq_enable();
- /*
- * Up to this point, X86_PF_USER set in hw_error_code
- * indicated a user-mode access. But, after this,
- * X86_PF_USER in sw_error_code will indicate either
- * that, *or* an implicit kernel(supervisor)-mode access
- * which originated from user mode.
- */
- if (!(hw_error_code & X86_PF_USER)) {
- /*
- * The CPU was in user mode, but the CPU says
- * the fault was not a user-mode access.
- * Must be an implicit kernel-mode access,
- * which we do not expect to happen in the
- * user address space.
- */
- pr_warn_once("kernel-mode error from user-mode: %lx\n",
- hw_error_code);
-
- sw_error_code |= X86_PF_USER;
- }
flags |= FAULT_FLAG_USER;
} else {
if (regs->flags & X86_EFLAGS_IF)
@@ -1313,9 +1380,9 @@ void do_user_addr_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- if (sw_error_code & X86_PF_WRITE)
+ if (hw_error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
- if (sw_error_code & X86_PF_INSTR)
+ if (hw_error_code & X86_PF_INSTR)
flags |= FAULT_FLAG_INSTRUCTION;
#ifdef CONFIG_X86_64
@@ -1328,7 +1395,7 @@ void do_user_addr_fault(struct pt_regs *regs,
* The vsyscall page does not have a "real" VMA, so do this
* emulation before we go searching for VMAs.
*/
- if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+ if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
if (emulate_vsyscall(regs, address))
return;
}
@@ -1344,18 +1411,15 @@ void do_user_addr_fault(struct pt_regs *regs,
* Only do the expensive exception table search when we might be at
* risk of a deadlock. This happens if we
* 1. Failed to acquire mmap_sem, and
- * 2. The access did not originate in userspace. Note: either the
- * hardware or earlier page fault code may set X86_PF_USER
- * in sw_error_code.
+ * 2. The access did not originate in userspace.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
- if (!(sw_error_code & X86_PF_USER) &&
- !search_exception_tables(regs->ip)) {
+ if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
/*
* Fault from code in kernel from
* which we do not expect faults.
*/
- bad_area_nosemaphore(regs, sw_error_code, address);
+ bad_area_nosemaphore(regs, hw_error_code, address);
return;
}
retry:
@@ -1371,29 +1435,17 @@ retry:
vma = find_vma(mm, address);
if (unlikely(!vma)) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
if (likely(vma->vm_start <= address))
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
- if (sw_error_code & X86_PF_USER) {
- /*
- * Accessing the stack below %sp is always a bug.
- * The large cushion allows instructions like enter
- * and pusha to work. ("enter $65535, $31" pushes
- * 32 pointers and then decrements %sp by 65535.)
- */
- if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
- bad_area(regs, sw_error_code, address);
- return;
- }
- }
if (unlikely(expand_stack(vma, address))) {
- bad_area(regs, sw_error_code, address);
+ bad_area(regs, hw_error_code, address);
return;
}
@@ -1402,8 +1454,8 @@ retry:
* we can handle it..
*/
good_area:
- if (unlikely(access_error(sw_error_code, vma))) {
- bad_area_access_error(regs, sw_error_code, address, vma);
+ if (unlikely(access_error(hw_error_code, vma))) {
+ bad_area_access_error(regs, hw_error_code, address, vma);
return;
}
@@ -1442,13 +1494,13 @@ good_area:
return;
/* Not returning to user mode? Handle exceptions or die: */
- no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
+ no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
return;
}
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, sw_error_code, address, fault);
+ mm_fault_error(regs, hw_error_code, address, fault);
return;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ef99f3892e1f..f905a2371080 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -742,7 +742,7 @@ int devmem_is_allowed(unsigned long pagenr)
return 1;
}
-void free_init_pages(char *what, unsigned long begin, unsigned long end)
+void free_init_pages(const char *what, unsigned long begin, unsigned long end)
{
unsigned long begin_aligned, end_aligned;
@@ -931,7 +931,7 @@ unsigned long max_swapfile_size(void)
pages = generic_max_swapfile_size();
- if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
unsigned long long l1tf_limit = l1tf_pfn_limit();
/*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 49ecf5ecf6d3..85c94f9a87f8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -860,7 +860,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5fab264948c2..bccff68e3267 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -432,7 +432,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PAGE_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pte(pte, __pte(0));
+ set_pte_safe(pte, __pte(0));
continue;
}
@@ -452,7 +452,7 @@ phys_pte_init(pte_t *pte_page, unsigned long paddr, unsigned long paddr_end,
pr_info(" pte=%p addr=%lx pte=%016lx\n", pte, paddr,
pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL).pte);
pages++;
- set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+ set_pte_safe(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
paddr_last = (paddr & PAGE_MASK) + PAGE_SIZE;
}
@@ -487,7 +487,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PMD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pmd(pmd, __pmd(0));
+ set_pmd_safe(pmd, __pmd(0));
continue;
}
@@ -524,7 +524,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_2M)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte((pte_t *)pmd,
+ set_pte_safe((pte_t *)pmd,
pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | _PAGE_PSE)));
spin_unlock(&init_mm.page_table_lock);
@@ -536,7 +536,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, unsigned long paddr_end,
paddr_last = phys_pte_init(pte, paddr, paddr_end, new_prot);
spin_lock(&init_mm.page_table_lock);
- pmd_populate_kernel(&init_mm, pmd, pte);
+ pmd_populate_kernel_safe(&init_mm, pmd, pte);
spin_unlock(&init_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -573,7 +573,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & PUD_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_pud(pud, __pud(0));
+ set_pud_safe(pud, __pud(0));
continue;
}
@@ -584,7 +584,6 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
paddr_end,
page_size_mask,
prot);
- __flush_tlb_all();
continue;
}
/*
@@ -611,7 +610,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
if (page_size_mask & (1<<PG_LEVEL_1G)) {
pages++;
spin_lock(&init_mm.page_table_lock);
- set_pte((pte_t *)pud,
+ set_pte_safe((pte_t *)pud,
pfn_pte((paddr & PUD_MASK) >> PAGE_SHIFT,
PAGE_KERNEL_LARGE));
spin_unlock(&init_mm.page_table_lock);
@@ -624,10 +623,9 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
page_size_mask, prot);
spin_lock(&init_mm.page_table_lock);
- pud_populate(&init_mm, pud, pmd);
+ pud_populate_safe(&init_mm, pud, pmd);
spin_unlock(&init_mm.page_table_lock);
}
- __flush_tlb_all();
update_page_count(PG_LEVEL_1G, pages);
@@ -659,7 +657,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
E820_TYPE_RAM) &&
!e820__mapped_any(paddr & P4D_MASK, paddr_next,
E820_TYPE_RESERVED_KERN))
- set_p4d(p4d, __p4d(0));
+ set_p4d_safe(p4d, __p4d(0));
continue;
}
@@ -668,7 +666,6 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
paddr_last = phys_pud_init(pud, paddr,
paddr_end,
page_size_mask);
- __flush_tlb_all();
continue;
}
@@ -677,10 +674,9 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
- p4d_populate(&init_mm, p4d, pud);
+ p4d_populate_safe(&init_mm, p4d, pud);
spin_unlock(&init_mm.page_table_lock);
}
- __flush_tlb_all();
return paddr_last;
}
@@ -723,9 +719,9 @@ kernel_physical_mapping_init(unsigned long paddr_start,
spin_lock(&init_mm.page_table_lock);
if (pgtable_l5_enabled())
- pgd_populate(&init_mm, pgd, p4d);
+ pgd_populate_safe(&init_mm, pgd, p4d);
else
- p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
+ p4d_populate_safe(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@@ -733,8 +729,6 @@ kernel_physical_mapping_init(unsigned long paddr_start,
if (pgd_changed)
sync_global_pgds(vaddr_start, vaddr_end - 1);
- __flush_tlb_all();
-
return paddr_last;
}
@@ -1147,7 +1141,8 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+int __ref arch_remove_memory(int nid, u64 start, u64 size,
+ struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 04a9cf6b034f..462fde83b515 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -211,7 +211,8 @@ static void __init kasan_early_p4d_populate(pgd_t *pgd,
unsigned long next;
if (pgd_none(*pgd)) {
- pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+ pgd_entry = __pgd(_KERNPG_TABLE |
+ __pa_nodebug(kasan_early_shadow_p4d));
set_pgd(pgd, pgd_entry);
}
@@ -222,7 +223,8 @@ static void __init kasan_early_p4d_populate(pgd_t *pgd,
if (!p4d_none(*p4d))
continue;
- p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+ p4d_entry = __p4d(_KERNPG_TABLE |
+ __pa_nodebug(kasan_early_shadow_pud));
set_p4d(p4d, p4d_entry);
} while (p4d++, addr = next, addr != end && p4d_none(*p4d));
}
@@ -261,10 +263,11 @@ static struct notifier_block kasan_die_notifier = {
void __init kasan_early_init(void)
{
int i;
- pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC;
- pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
- pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
- p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
+ pteval_t pte_val = __pa_nodebug(kasan_early_shadow_page) |
+ __PAGE_KERNEL | _PAGE_ENC;
+ pmdval_t pmd_val = __pa_nodebug(kasan_early_shadow_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_early_shadow_pmd) | _KERNPG_TABLE;
+ p4dval_t p4d_val = __pa_nodebug(kasan_early_shadow_pud) | _KERNPG_TABLE;
/* Mask out unsupported __PAGE_KERNEL bits: */
pte_val &= __default_kernel_pte_mask;
@@ -273,16 +276,16 @@ void __init kasan_early_init(void)
p4d_val &= __default_kernel_pte_mask;
for (i = 0; i < PTRS_PER_PTE; i++)
- kasan_zero_pte[i] = __pte(pte_val);
+ kasan_early_shadow_pte[i] = __pte(pte_val);
for (i = 0; i < PTRS_PER_PMD; i++)
- kasan_zero_pmd[i] = __pmd(pmd_val);
+ kasan_early_shadow_pmd[i] = __pmd(pmd_val);
for (i = 0; i < PTRS_PER_PUD; i++)
- kasan_zero_pud[i] = __pud(pud_val);
+ kasan_early_shadow_pud[i] = __pud(pud_val);
for (i = 0; pgtable_l5_enabled() && i < PTRS_PER_P4D; i++)
- kasan_zero_p4d[i] = __p4d(p4d_val);
+ kasan_early_shadow_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_top_pgt);
kasan_map_early_shadow(init_top_pgt);
@@ -326,7 +329,7 @@ void __init kasan_init(void)
clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
- kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
+ kasan_populate_early_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_MAX_ENTRIES; i++) {
@@ -338,41 +341,41 @@ void __init kasan_init(void)
shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
- shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
- PAGE_SIZE);
+ shadow_cpu_entry_begin = (void *)round_down(
+ (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
- shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
- PAGE_SIZE);
+ shadow_cpu_entry_end = (void *)round_up(
+ (unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
- kasan_populate_zero_shadow(
+ kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
shadow_cpu_entry_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
- kasan_populate_zero_shadow(shadow_cpu_entry_end,
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ kasan_populate_early_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
early_pfn_to_nid(__pa(_stext)));
- kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
- (void *)KASAN_SHADOW_END);
+ kasan_populate_early_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
load_cr3(init_top_pgt);
__flush_tlb_all();
/*
- * kasan_zero_page has been used as early shadow memory, thus it may
- * contain some garbage. Now we can clear and write protect it, since
- * after the TLB flush no one should write to it.
+ * kasan_early_shadow_page has been used as early shadow memory, thus
+ * it may contain some garbage. Now we can clear and write protect it,
+ * since after the TLB flush no one should write to it.
*/
- memset(kasan_zero_page, 0, PAGE_SIZE);
+ memset(kasan_early_shadow_page, 0, PAGE_SIZE);
for (i = 0; i < PTRS_PER_PTE; i++) {
pte_t pte;
pgprot_t prot;
@@ -380,8 +383,8 @@ void __init kasan_init(void)
prot = __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC);
pgprot_val(prot) &= __default_kernel_pte_mask;
- pte = __pte(__pa(kasan_zero_page) | pgprot_val(prot));
- set_pte(&kasan_zero_pte[i], pte);
+ pte = __pte(__pa(kasan_early_shadow_page) | pgprot_val(prot));
+ set_pte(&kasan_early_shadow_pte[i], pte);
}
/* Flush TLBs again to be sure that write protection applied. */
__flush_tlb_all();
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 006f373f54ab..385afa2b9e17 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -381,13 +381,6 @@ void __init mem_encrypt_init(void)
swiotlb_update_mem_attributes();
/*
- * With SEV, DMA operations cannot use encryption, we need to use
- * SWIOTLB to bounce buffer DMA operation.
- */
- if (sev_active())
- dma_ops = &swiotlb_dma_ops;
-
- /*
* With SEV, we need to unroll the rep string I/O instructions.
*/
if (sev_active())
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 4e1f6e1b8159..319bde386d5f 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -19,4 +19,6 @@ extern int after_bootmem;
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
+extern unsigned long tlb_single_page_flush_ceiling;
+
#endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 2385538e8065..de1851d15699 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -495,7 +495,7 @@ static int get_bt_addr(struct mm_struct *mm,
unsigned long bd_entry;
unsigned long bt_addr;
- if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
+ if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr)))
return -EFAULT;
while (1) {
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 08f8f76a4852..facce271e8b9 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -23,7 +23,8 @@
static __read_mostly int print = 1;
enum {
- NTEST = 400,
+ NTEST = 3 * 100,
+ NPAGES = 100,
#ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
#elif defined(CONFIG_X86_PAE)
@@ -110,6 +111,9 @@ static int print_split(struct split_state *s)
static unsigned long addr[NTEST];
static unsigned int len[NTEST];
+static struct page *pages[NPAGES];
+static unsigned long addrs[NPAGES];
+
/* Change the global bit on random pages in the direct mapping */
static int pageattr_test(void)
{
@@ -120,7 +124,6 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
- unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -137,7 +140,7 @@ static int pageattr_test(void)
unsigned long pfn = prandom_u32() % max_pfn_mapped;
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
- len[i] = prandom_u32() % 100;
+ len[i] = prandom_u32() % NPAGES;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
@@ -167,14 +170,29 @@ static int pageattr_test(void)
break;
}
__set_bit(pfn + k, bm);
+ addrs[k] = addr[i] + k*PAGE_SIZE;
+ pages[k] = pfn_to_page(pfn + k);
}
if (!addr[i] || !pte || !k) {
addr[i] = 0;
continue;
}
- test_addr = addr[i];
- err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ switch (i % 3) {
+ case 0:
+ err = change_page_attr_set(&addr[i], len[i], PAGE_CPA_TEST, 0);
+ break;
+
+ case 1:
+ err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1);
+ break;
+
+ case 2:
+ err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST);
+ break;
+ }
+
+
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
@@ -206,8 +224,7 @@ static int pageattr_test(void)
failed++;
continue;
}
- test_addr = addr[i];
- err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
+ err = change_page_attr_clear(&addr[i], len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index db7a10082238..4f8972311a77 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -26,6 +26,8 @@
#include <asm/pat.h>
#include <asm/set_memory.h>
+#include "mm_internal.h"
+
/*
* The current flushing context - we pass it instead of 5 arguments:
*/
@@ -35,11 +37,11 @@ struct cpa_data {
pgprot_t mask_set;
pgprot_t mask_clr;
unsigned long numpages;
- int flags;
+ unsigned long curpage;
unsigned long pfn;
- unsigned force_split : 1,
+ unsigned int flags;
+ unsigned int force_split : 1,
force_static_prot : 1;
- int curpage;
struct page **pages;
};
@@ -228,19 +230,28 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
#endif
+static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
+{
+ if (cpa->flags & CPA_PAGES_ARRAY) {
+ struct page *page = cpa->pages[idx];
+
+ if (unlikely(PageHighMem(page)))
+ return 0;
+
+ return (unsigned long)page_address(page);
+ }
+
+ if (cpa->flags & CPA_ARRAY)
+ return cpa->vaddr[idx];
+
+ return *cpa->vaddr + idx * PAGE_SIZE;
+}
+
/*
* Flushing functions
*/
-/**
- * clflush_cache_range - flush a cache range with clflush
- * @vaddr: virtual start address
- * @size: number of bytes to flush
- *
- * clflushopt is an unordered instruction which needs fencing with mfence or
- * sfence to avoid ordering issues.
- */
-void clflush_cache_range(void *vaddr, unsigned int size)
+static void clflush_cache_range_opt(void *vaddr, unsigned int size)
{
const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
@@ -249,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size)
if (p >= vend)
return;
- mb();
-
for (; p < vend; p += clflush_size)
clflushopt(p);
+}
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr: virtual start address
+ * @size: number of bytes to flush
+ *
+ * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
+ * SFENCE to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+ mb();
+ clflush_cache_range_opt(vaddr, size);
mb();
}
EXPORT_SYMBOL_GPL(clflush_cache_range);
@@ -285,79 +307,49 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
+void __cpa_flush_tlb(void *data)
{
- BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
-
- WARN_ON(PAGE_ALIGN(start) != start);
-
- if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
- cpa_flush_all(cache);
- return true;
- }
+ struct cpa_data *cpa = data;
+ unsigned int i;
- flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
-
- return !cache;
+ for (i = 0; i < cpa->numpages; i++)
+ __flush_tlb_one_kernel(__cpa_addr(cpa, i));
}
-static void cpa_flush_range(unsigned long start, int numpages, int cache)
+static void cpa_flush(struct cpa_data *data, int cache)
{
- unsigned int i, level;
- unsigned long addr;
+ struct cpa_data *cpa = data;
+ unsigned int i;
- if (__cpa_flush_range(start, numpages, cache))
- return;
-
- /*
- * We only need to flush on one CPU,
- * clflush is a MESI-coherent instruction that
- * will cause all other CPUs to flush the same
- * cachelines:
- */
- for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
- pte_t *pte = lookup_address(addr, &level);
+ BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
- /*
- * Only flush present addresses:
- */
- if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *) addr, PAGE_SIZE);
+ if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+ cpa_flush_all(cache);
+ return;
}
-}
-static void cpa_flush_array(unsigned long baddr, unsigned long *start,
- int numpages, int cache,
- int in_flags, struct page **pages)
-{
- unsigned int i, level;
+ if (cpa->numpages <= tlb_single_page_flush_ceiling)
+ on_each_cpu(__cpa_flush_tlb, cpa, 1);
+ else
+ flush_tlb_all();
- if (__cpa_flush_range(baddr, numpages, cache))
+ if (!cache)
return;
- /*
- * We only need to flush on one CPU,
- * clflush is a MESI-coherent instruction that
- * will cause all other CPUs to flush the same
- * cachelines:
- */
- for (i = 0; i < numpages; i++) {
- unsigned long addr;
- pte_t *pte;
-
- if (in_flags & CPA_PAGES_ARRAY)
- addr = (unsigned long)page_address(pages[i]);
- else
- addr = start[i];
+ mb();
+ for (i = 0; i < cpa->numpages; i++) {
+ unsigned long addr = __cpa_addr(cpa, i);
+ unsigned int level;
- pte = lookup_address(addr, &level);
+ pte_t *pte = lookup_address(addr, &level);
/*
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range((void *)addr, PAGE_SIZE);
+ clflush_cache_range_opt((void *)addr, PAGE_SIZE);
}
+ mb();
}
static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -1468,15 +1460,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
unsigned int level;
pte_t *kpte, old_pte;
- if (cpa->flags & CPA_PAGES_ARRAY) {
- struct page *page = cpa->pages[cpa->curpage];
- if (unlikely(PageHighMem(page)))
- return 0;
- address = (unsigned long)page_address(page);
- } else if (cpa->flags & CPA_ARRAY)
- address = cpa->vaddr[cpa->curpage];
- else
- address = *cpa->vaddr;
+ address = __cpa_addr(cpa, cpa->curpage);
repeat:
kpte = _lookup_address_cpa(cpa, address, &level);
if (!kpte)
@@ -1557,22 +1541,14 @@ static int cpa_process_alias(struct cpa_data *cpa)
* No need to redo, when the primary call touched the direct
* mapping already:
*/
- if (cpa->flags & CPA_PAGES_ARRAY) {
- struct page *page = cpa->pages[cpa->curpage];
- if (unlikely(PageHighMem(page)))
- return 0;
- vaddr = (unsigned long)page_address(page);
- } else if (cpa->flags & CPA_ARRAY)
- vaddr = cpa->vaddr[cpa->curpage];
- else
- vaddr = *cpa->vaddr;
-
+ vaddr = __cpa_addr(cpa, cpa->curpage);
if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
alias_cpa = *cpa;
alias_cpa.vaddr = &laddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ alias_cpa.curpage = 0;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
if (ret)
@@ -1592,6 +1568,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa = *cpa;
alias_cpa.vaddr = &temp_cpa_vaddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+ alias_cpa.curpage = 0;
/*
* The high mapping range is imprecise, so ignore the
@@ -1607,14 +1584,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
{
unsigned long numpages = cpa->numpages;
- int ret;
+ unsigned long rempages = numpages;
+ int ret = 0;
- while (numpages) {
+ while (rempages) {
/*
* Store the remaining nr of pages for the large page
* preservation check.
*/
- cpa->numpages = numpages;
+ cpa->numpages = rempages;
/* for array changes, we can't use large page */
if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
@@ -1625,12 +1603,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
- return ret;
+ goto out;
if (checkalias) {
ret = cpa_process_alias(cpa);
if (ret)
- return ret;
+ goto out;
}
/*
@@ -1638,15 +1616,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
* CPA operation. Either a large page has been
* preserved or a single page update happened.
*/
- BUG_ON(cpa->numpages > numpages || !cpa->numpages);
- numpages -= cpa->numpages;
- if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
- cpa->curpage++;
- else
- *cpa->vaddr += cpa->numpages * PAGE_SIZE;
-
+ BUG_ON(cpa->numpages > rempages || !cpa->numpages);
+ rempages -= cpa->numpages;
+ cpa->curpage += cpa->numpages;
}
- return 0;
+
+out:
+ /* Restore the original numpages */
+ cpa->numpages = numpages;
+ return ret;
}
/*
@@ -1679,7 +1657,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
{
struct cpa_data cpa;
int ret, cache, checkalias;
- unsigned long baddr = 0;
memset(&cpa, 0, sizeof(cpa));
@@ -1704,7 +1681,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
} else if (!(in_flag & CPA_PAGES_ARRAY)) {
/*
* in_flag of CPA_PAGES_ARRAY implies it is aligned.
- * No need to cehck in that case
+ * No need to check in that case
*/
if (*addr & ~PAGE_MASK) {
*addr &= PAGE_MASK;
@@ -1713,11 +1690,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
*/
WARN_ON_ONCE(1);
}
- /*
- * Save address for cache flush. *addr is modified in the call
- * to __change_page_attr_set_clr() below.
- */
- baddr = make_addr_canonical_again(*addr);
}
/* Must avoid aliasing mappings in the highmem code */
@@ -1765,13 +1737,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
goto out;
}
- if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
- cpa_flush_array(baddr, addr, numpages, cache,
- cpa.flags, pages);
- } else {
- cpa_flush_range(baddr, numpages, cache);
- }
-
+ cpa_flush(&cpa, cache);
out:
return ret;
}
@@ -1842,14 +1808,14 @@ out_err:
}
EXPORT_SYMBOL(set_memory_uc);
-static int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int numpages,
enum page_cache_mode new_type)
{
enum page_cache_mode set_type;
int i, j;
int ret;
- for (i = 0; i < addrinarray; i++) {
+ for (i = 0; i < numpages; i++) {
ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
new_type, NULL);
if (ret)
@@ -1860,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int addrinarray,
set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
_PAGE_CACHE_MODE_UC_MINUS : new_type;
- ret = change_page_attr_set(addr, addrinarray,
+ ret = change_page_attr_set(addr, numpages,
cachemode2pgprot(set_type), 1);
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
- ret = change_page_attr_set_clr(addr, addrinarray,
+ ret = change_page_attr_set_clr(addr, numpages,
cachemode2pgprot(
_PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
@@ -1881,36 +1847,34 @@ out_free:
return ret;
}
-int set_memory_array_uc(unsigned long *addr, int addrinarray)
+int set_memory_array_uc(unsigned long *addr, int numpages)
{
- return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+ return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_memory_array_uc);
-int set_memory_array_wc(unsigned long *addr, int addrinarray)
+int set_memory_array_wc(unsigned long *addr, int numpages)
{
- return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
+ return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_memory_array_wc);
-int set_memory_array_wt(unsigned long *addr, int addrinarray)
+int set_memory_array_wt(unsigned long *addr, int numpages)
{
- return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+ return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT);
}
EXPORT_SYMBOL_GPL(set_memory_array_wt);
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
- unsigned long addr_copy = addr;
ret = change_page_attr_set(&addr, numpages,
cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
0);
if (!ret) {
- ret = change_page_attr_set_clr(&addr_copy, numpages,
- cachemode2pgprot(
- _PAGE_CACHE_MODE_WC),
+ ret = change_page_attr_set_clr(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
0, 0, NULL);
}
@@ -1977,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages)
}
EXPORT_SYMBOL(set_memory_wb);
-int set_memory_array_wb(unsigned long *addr, int addrinarray)
+int set_memory_array_wb(unsigned long *addr, int numpages)
{
int i;
int ret;
/* WB cache mode is hard wired to all cache attribute bits being 0 */
- ret = change_page_attr_clear(addr, addrinarray,
+ ret = change_page_attr_clear(addr, numpages,
__pgprot(_PAGE_CACHE_MASK), 1);
if (ret)
return ret;
- for (i = 0; i < addrinarray; i++)
+ for (i = 0; i < numpages; i++)
free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
return 0;
@@ -2058,7 +2022,6 @@ int set_memory_global(unsigned long addr, int numpages)
static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
{
struct cpa_data cpa;
- unsigned long start;
int ret;
/* Nothing to do if memory encryption is not active */
@@ -2069,8 +2032,6 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
addr &= PAGE_MASK;
- start = addr;
-
memset(&cpa, 0, sizeof(cpa));
cpa.vaddr = &addr;
cpa.numpages = numpages;
@@ -2085,18 +2046,18 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
* Before changing the encryption attribute, we need to flush caches.
*/
- cpa_flush_range(start, numpages, 1);
+ cpa_flush(&cpa, 1);
ret = __change_page_attr_set_clr(&cpa, 1);
/*
- * After changing the encryption attribute, we need to flush TLBs
- * again in case any speculative TLB caching occurred (but no need
- * to flush caches again). We could just use cpa_flush_all(), but
- * in case TLB flushing gets optimized in the cpa_flush_range()
- * path use the same logic as above.
+ * After changing the encryption attribute, we need to flush TLBs again
+ * in case any speculative TLB caching occurred (but no need to flush
+ * caches again). We could just use cpa_flush_all(), but in case TLB
+ * flushing gets optimized in the cpa_flush() path use the same logic
+ * as above.
*/
- cpa_flush_range(start, numpages, 0);
+ cpa_flush(&cpa, 0);
return ret;
}
@@ -2121,7 +2082,7 @@ int set_pages_uc(struct page *page, int numpages)
}
EXPORT_SYMBOL(set_pages_uc);
-static int _set_pages_array(struct page **pages, int addrinarray,
+static int _set_pages_array(struct page **pages, int numpages,
enum page_cache_mode new_type)
{
unsigned long start;
@@ -2131,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int addrinarray,
int free_idx;
int ret;
- for (i = 0; i < addrinarray; i++) {
+ for (i = 0; i < numpages; i++) {
if (PageHighMem(pages[i]))
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
@@ -2144,10 +2105,10 @@ static int _set_pages_array(struct page **pages, int addrinarray,
set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
_PAGE_CACHE_MODE_UC_MINUS : new_type;
- ret = cpa_set_pages_array(pages, addrinarray,
+ ret = cpa_set_pages_array(pages, numpages,
cachemode2pgprot(set_type));
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
- ret = change_page_attr_set_clr(NULL, addrinarray,
+ ret = change_page_attr_set_clr(NULL, numpages,
cachemode2pgprot(
_PAGE_CACHE_MODE_WC),
__pgprot(_PAGE_CACHE_MASK),
@@ -2167,21 +2128,21 @@ err_out:
return -EINVAL;
}
-int set_pages_array_uc(struct page **pages, int addrinarray)
+int set_pages_array_uc(struct page **pages, int numpages)
{
- return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+ return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_UC_MINUS);
}
EXPORT_SYMBOL(set_pages_array_uc);
-int set_pages_array_wc(struct page **pages, int addrinarray)
+int set_pages_array_wc(struct page **pages, int numpages)
{
- return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
+ return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WC);
}
EXPORT_SYMBOL(set_pages_array_wc);
-int set_pages_array_wt(struct page **pages, int addrinarray)
+int set_pages_array_wt(struct page **pages, int numpages)
{
- return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+ return _set_pages_array(pages, numpages, _PAGE_CACHE_MODE_WT);
}
EXPORT_SYMBOL_GPL(set_pages_array_wt);
@@ -2193,7 +2154,7 @@ int set_pages_wb(struct page *page, int numpages)
}
EXPORT_SYMBOL(set_pages_wb);
-int set_pages_array_wb(struct page **pages, int addrinarray)
+int set_pages_array_wb(struct page **pages, int numpages)
{
int retval;
unsigned long start;
@@ -2201,12 +2162,12 @@ int set_pages_array_wb(struct page **pages, int addrinarray)
int i;
/* WB cache mode is hard wired to all cache attribute bits being 0 */
- retval = cpa_clear_pages_array(pages, addrinarray,
+ retval = cpa_clear_pages_array(pages, numpages,
__pgprot(_PAGE_CACHE_MASK));
if (retval)
return retval;
- for (i = 0; i < addrinarray; i++) {
+ for (i = 0; i < numpages; i++) {
if (PageHighMem(pages[i]))
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
@@ -2338,8 +2299,8 @@ bool kernel_page_present(struct page *page)
#endif /* CONFIG_DEBUG_PAGEALLOC */
-int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
- unsigned numpages, unsigned long page_flags)
+int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+ unsigned numpages, unsigned long page_flags)
{
int retval = -EINVAL;
@@ -2353,6 +2314,8 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.flags = 0,
};
+ WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
+
if (!(__supported_pte_mask & _PAGE_NX))
goto out;
@@ -2375,6 +2338,40 @@ out:
}
/*
+ * __flush_tlb_all() flushes mappings only on current CPU and hence this
+ * function shouldn't be used in an SMP environment. Presently, it's used only
+ * during boot (way before smp_init()) by EFI subsystem and hence is ok.
+ */
+int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
+ unsigned long numpages)
+{
+ int retval;
+
+ /*
+ * The typical sequence for unmapping is to find a pte through
+ * lookup_address_in_pgd() (ideally, it should never return NULL because
+ * the address is already mapped) and change it's protections. As pfn is
+ * the *target* of a mapping, it's not useful while unmapping.
+ */
+ struct cpa_data cpa = {
+ .vaddr = &address,
+ .pfn = 0,
+ .pgd = pgd,
+ .numpages = numpages,
+ .mask_set = __pgprot(0),
+ .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
+ .flags = 0,
+ };
+
+ WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
+
+ retval = __change_page_attr_set_clr(&cpa, 0);
+ __flush_tlb_all();
+
+ return retval;
+}
+
+/*
* The testcases use internal knowledge of the implementation that shouldn't
* be exposed to the rest of the kernel. Include these directly here.
*/
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 08013524fba1..4fe956a63b25 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -519,8 +519,13 @@ static u64 sanitize_phys(u64 address)
* for a "decoy" virtual address (bit 63 clear) passed to
* set_memory_X(). __pa() on a "decoy" address results in a
* physical address with bit 63 set.
+ *
+ * Decoy addresses are not present for 32-bit builds, see
+ * set_mce_nospec().
*/
- return address & __PHYSICAL_MASK;
+ if (IS_ENABLED(CONFIG_X86_64))
+ return address & __PHYSICAL_MASK;
+ return address;
}
/*
@@ -546,7 +551,11 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
start = sanitize_phys(start);
end = sanitize_phys(end);
- BUG_ON(start >= end); /* end is exclusive */
+ if (start >= end) {
+ WARN(1, "%s failed: [mem %#010Lx-%#010Lx], req %s\n", __func__,
+ start, end - 1, cattr_name(req_type));
+ return -EINVAL;
+ }
if (!pat_enabled()) {
/* This is identical to page table setting without PAT */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 59274e2c1ac4..7bd01709a091 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -23,12 +23,12 @@ EXPORT_SYMBOL(physical_mask);
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
}
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+pgtable_t pte_alloc_one(struct mm_struct *mm)
{
struct page *pte;
@@ -794,6 +794,14 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
+/*
+ * Until we support 512GB pages, skip them in the vmap area.
+ */
+int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+ return 0;
+}
+
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -811,9 +819,6 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr)
pte_t *pte;
int i;
- if (pud_none(*pud))
- return 1;
-
pmd = (pmd_t *)pud_page_vaddr(*pud);
pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
if (!pmd_sv)
@@ -855,9 +860,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
pte_t *pte;
- if (pmd_none(*pmd))
- return 1;
-
pte = (pte_t *)pmd_page_vaddr(*pmd);
pmd_clear(pmd);
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 6e98e0a7c923..047a77f6a10c 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -131,6 +131,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
* in the process's lifetime will not accidentally get access
* to data which is pkey-protected later on.
*/
+static
u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 03b6b4c2238d..999d6d8f0bef 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,6 +15,8 @@
#include <asm/apic.h>
#include <asm/uv/uv.h>
+#include "mm_internal.h"
+
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
@@ -721,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
*
* This is in units of pages.
*/
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,