aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/cpu_entry_area.c4
-rw-r--r--arch/x86/mm/dump_pagetables.c35
-rw-r--r--arch/x86/mm/fault.c195
-rw-r--r--arch/x86/mm/highmem_32.c50
-rw-r--r--arch/x86/mm/hugetlbpage.c23
-rw-r--r--arch/x86/mm/init.c48
-rw-r--r--arch/x86/mm/init_64.c24
-rw-r--r--arch/x86/mm/ioremap.c10
-rw-r--r--arch/x86/mm/kmmio.c4
-rw-r--r--arch/x86/mm/mem_encrypt.c2
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/mmio-mod.c4
-rw-r--r--arch/x86/mm/numa.c11
-rw-r--r--arch/x86/mm/numa_32.c34
-rw-r--r--arch/x86/mm/pat/set_memory.c7
-rw-r--r--arch/x86/mm/pgtable.c16
-rw-r--r--arch/x86/mm/pgtable_32.c2
-rw-r--r--arch/x86/mm/pti.c8
-rw-r--r--arch/x86/mm/tlb.c421
19 files changed, 532 insertions, 368 deletions
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 56f9189bbadb..5199d8a1daf1 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -17,7 +17,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
#endif
-#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT)
+#ifdef CONFIG_X86_32
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
#endif
@@ -114,12 +114,10 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu)
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu)
{
-#ifdef CONFIG_DOUBLEFAULT
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
cea_map_percpu_pages(&cea->doublefault_stack,
&per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL);
-#endif
}
#endif
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 69309cd56fdf..ea9010113f69 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -249,10 +249,22 @@ static void note_wx(struct pg_state *st, unsigned long addr)
(void *)st->start_address);
}
-static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+static void effective_prot(struct ptdump_state *pt_st, int level, u64 val)
{
- return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
- ((prot1 | prot2) & _PAGE_NX);
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t prot = val & PTE_FLAGS_MASK;
+ pgprotval_t effective;
+
+ if (level > 0) {
+ pgprotval_t higher_prot = st->prot_levels[level - 1];
+
+ effective = (higher_prot & prot & (_PAGE_USER | _PAGE_RW)) |
+ ((higher_prot | prot) & _PAGE_NX);
+ } else {
+ effective = prot;
+ }
+
+ st->prot_levels[level] = effective;
}
/*
@@ -261,7 +273,7 @@ static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
* print what we collected so far.
*/
static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
- unsigned long val)
+ u64 val)
{
struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
pgprotval_t new_prot, new_eff;
@@ -270,16 +282,10 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
struct seq_file *m = st->seq;
new_prot = val & PTE_FLAGS_MASK;
-
- if (level > 0) {
- new_eff = effective_prot(st->prot_levels[level - 1],
- new_prot);
- } else {
- new_eff = new_prot;
- }
-
- if (level >= 0)
- st->prot_levels[level] = new_eff;
+ if (!val)
+ new_eff = 0;
+ else
+ new_eff = st->prot_levels[level];
/*
* If we have a "break" in the series, we need to flush the state that
@@ -374,6 +380,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m,
struct pg_state st = {
.ptdump = {
.note_page = note_page,
+ .effective_prot = effective_prot,
.range = ptdump_ranges
},
.level = -1,
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a51df516b87b..c5437f2964ee 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,6 +30,7 @@
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
+#include <asm/kvm_para.h> /* kvm_handle_async_pf */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -190,16 +191,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}
-static void vmalloc_sync(void)
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
{
- unsigned long address;
-
- if (SHARED_KERNEL_PMD)
- return;
+ unsigned long addr;
- for (address = VMALLOC_START & PMD_MASK;
- address >= TASK_SIZE_MAX && address < VMALLOC_END;
- address += PMD_SIZE) {
+ for (addr = start & PMD_MASK;
+ addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
+ addr += PMD_SIZE) {
struct page *page;
spin_lock(&pgd_lock);
@@ -210,61 +208,13 @@ static void vmalloc_sync(void)
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- vmalloc_sync_one(page_address(page), address);
+ vmalloc_sync_one(page_address(page), addr);
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
}
}
-void vmalloc_sync_mappings(void)
-{
- vmalloc_sync();
-}
-
-void vmalloc_sync_unmappings(void)
-{
- vmalloc_sync();
-}
-
-/*
- * 32-bit:
- *
- * Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- unsigned long pgd_paddr;
- pmd_t *pmd_k;
- pte_t *pte_k;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "current" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- pgd_paddr = read_cr3_pa();
- pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
- if (!pmd_k)
- return -1;
-
- if (pmd_large(*pmd_k))
- return 0;
-
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
/*
* Did it hit the DOS screen memory VA from vm86 mode?
*/
@@ -329,96 +279,6 @@ out:
#else /* CONFIG_X86_64: */
-void vmalloc_sync_mappings(void)
-{
- /*
- * 64-bit mappings might allocate new p4d/pud pages
- * that need to be propagated to all tasks' PGDs.
- */
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-}
-
-void vmalloc_sync_unmappings(void)
-{
- /*
- * Unmappings never allocate or free p4d/pud pages.
- * No work is required here.
- */
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
- pgd_t *pgd, *pgd_k;
- p4d_t *p4d, *p4d_k;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- /* Make sure we are in vmalloc area: */
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
- return -1;
-
- /*
- * Copy kernel mappings over when needed. This can also
- * happen within a race in page table update. In the later
- * case just flush:
- */
- pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
- pgd_k = pgd_offset_k(address);
- if (pgd_none(*pgd_k))
- return -1;
-
- if (pgtable_l5_enabled()) {
- if (pgd_none(*pgd)) {
- set_pgd(pgd, *pgd_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
- }
- }
-
- /* With 4-level paging, copying happens on the p4d level. */
- p4d = p4d_offset(pgd, address);
- p4d_k = p4d_offset(pgd_k, address);
- if (p4d_none(*p4d_k))
- return -1;
-
- if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
- set_p4d(p4d, *p4d_k);
- arch_flush_lazy_mmu_mode();
- } else {
- BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
- }
-
- BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
- pud = pud_offset(p4d, address);
- if (pud_none(*pud))
- return -1;
-
- if (pud_large(*pud))
- return 0;
-
- pmd = pmd_offset(pud, address);
- if (pmd_none(*pmd))
- return -1;
-
- if (pmd_large(*pmd))
- return 0;
-
- pte = pte_offset_kernel(pmd, address);
- if (!pte_present(*pte))
- return -1;
-
- return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
#ifdef CONFIG_CPU_SUP_AMD
static const char errata93_warning[] =
KERN_ERR
@@ -1257,29 +1117,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
*/
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
- /*
- * We can fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
- *
- * Before doing this on-demand faulting, ensure that the
- * fault is not any of the following:
- * 1. A fault on a PTE with a reserved bit set.
- * 2. A fault caused by a user-mode access. (Do not demand-
- * fault kernel memory due to user-mode accesses).
- * 3. A fault caused by a page-level protection violation.
- * (A demand fault would be on a non-present page which
- * would have X86_PF_PROT==0).
- */
- if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
- if (vmalloc_fault(address) >= 0)
- return;
- }
-
/* Was the fault spurious, caused by lazy TLB invalidation? */
if (spurious_kernel_fault(hw_error_code, address))
return;
@@ -1523,6 +1360,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
unsigned long address)
{
prefetchw(&current->mm->mmap_sem);
+ /*
+ * KVM has two types of events that are, logically, interrupts, but
+ * are unfortunately delivered using the #PF vector. These events are
+ * "you just accessed valid memory, but the host doesn't have it right
+ * now, so I'll put you to sleep if you continue" and "that memory
+ * you tried to access earlier is available now."
+ *
+ * We are relying on the interrupted context being sane (valid RSP,
+ * relevant locks not held, etc.), which is fine as long as the
+ * interrupted context had IF=1. We are also relying on the KVM
+ * async pf type field and CR2 being read consistently instead of
+ * getting values from real and async page faults mixed up.
+ *
+ * Fingers crossed.
+ */
+ if (kvm_handle_async_pf(regs, (u32)address))
+ return;
+
trace_page_fault_entries(regs, hw_error_code, address);
if (unlikely(kmmio_fault(regs, address)))
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 0a1898b8552e..075fe51317b0 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,44 +4,11 @@
#include <linux/swap.h> /* for totalram_pages */
#include <linux/memblock.h>
-void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-EXPORT_SYMBOL(kmap);
-
-void kunmap(struct page *page)
-{
- if (in_interrupt())
- BUG();
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-EXPORT_SYMBOL(kunmap);
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However when holding an atomic kmap it is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
- preempt_disable();
- pagefault_disable();
-
- if (!PageHighMem(page))
- return page_address(page);
-
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
@@ -51,13 +18,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
return (void *)vaddr;
}
-EXPORT_SYMBOL(kmap_atomic_prot);
-
-void *kmap_atomic(struct page *page)
-{
- return kmap_atomic_prot(page, kmap_prot);
-}
-EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_high_prot);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
@@ -69,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
-void __kunmap_atomic(void *kvaddr)
+void kunmap_atomic_high(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -99,11 +60,8 @@ void __kunmap_atomic(void *kvaddr)
BUG_ON(vaddr >= (unsigned long)high_memory);
}
#endif
-
- pagefault_enable();
- preempt_enable();
}
-EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic_high);
void __init set_highmem_pages_init(void)
{
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 5bfd5aef5378..cf5781142716 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -181,28 +181,21 @@ get_unmapped_area:
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_X86_64
-static __init int setup_hugepagesz(char *opt)
+bool __init arch_hugetlb_valid_size(unsigned long size)
{
- unsigned long ps = memparse(opt, &opt);
- if (ps == PMD_SIZE) {
- hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
- } else if (ps == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES)) {
- hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
- } else {
- hugetlb_bad_size();
- printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
- ps >> 20);
- return 0;
- }
- return 1;
+ if (size == PMD_SIZE)
+ return true;
+ else if (size == PUD_SIZE && boot_cpu_has(X86_FEATURE_GBPAGES))
+ return true;
+ else
+ return false;
}
-__setup("hugepagesz=", setup_hugepagesz);
#ifdef CONFIG_CONTIG_ALLOC
static __init int gigantic_pages_init(void)
{
/* With compaction or CMA we can allocate gigantic pages at runtime */
- if (boot_cpu_has(X86_FEATURE_GBPAGES) && !size_to_hstate(1UL << PUD_SHIFT))
+ if (boot_cpu_has(X86_FEATURE_GBPAGES))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1bba16c5742b..112d3b98a3b6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -49,7 +49,7 @@
* Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
* (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
@@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
-EXPORT_SYMBOL(__cachemode2pte_tbl);
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+ if (likely(pcm == 0))
+ return 0;
+ return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -69,7 +76,22 @@ uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/* Check that the write-protect PAT entry is set for write-protect */
+bool x86_has_pat_wp(void)
+{
+ return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+ unsigned long masked;
+
+ masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+ if (likely(masked == 0))
+ return 0;
+ return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
@@ -121,8 +143,6 @@ __ref void *alloc_low_pages(unsigned int num)
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
- printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
- pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
for (i = 0; i < num; i++) {
@@ -172,6 +192,19 @@ struct map_range {
static int page_size_mask;
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static void __init probe_page_size_mask(void)
{
/*
@@ -949,7 +982,7 @@ void __init zone_sizes_init(void)
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
@@ -957,7 +990,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3b289c2f75cd..add03c35aa34 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,7 @@
#include <asm/init.h>
#include <asm/uv/uv.h>
#include <asm/setup.h>
+#include <asm/ftrace.h>
#include "mm_internal.h"
@@ -217,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end)
sync_global_pgds_l4(start, end);
}
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+ sync_global_pgds(start, end);
+}
+
/*
* NOTE: This function is marked __ref because it calls __init function
* (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
@@ -298,7 +304,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one_kernel(vaddr);
+ flush_tlb_one_kernel(vaddr);
}
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
@@ -367,7 +373,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
pgprot_t prot;
pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) |
- pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache)));
+ protval_4k_2_large(cachemode2protval(cache));
BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
@@ -1259,6 +1265,18 @@ void __init mem_init(void)
mem_init_print_info(NULL);
}
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ /*
+ * More CPUs always led to greater speedups on tested systems, up to
+ * all the nodes' CPUs. Use all since the system is otherwise idle
+ * now.
+ */
+ return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
int kernel_set_to_readonly;
void mark_rodata_ro(void)
@@ -1291,6 +1309,8 @@ void mark_rodata_ro(void)
all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+ set_ftrace_ops_ro();
+
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
set_memory_rw(start, (end-start) >> PAGE_SHIFT);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 18c637c0dc6f..986d57534fd6 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr,
void __init *early_memremap_encrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
- /* Be sure the write-protect PAT entry is set for write-protect */
- if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ if (!x86_has_pat_wp())
return NULL;
-
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP);
}
@@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr,
void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
unsigned long size)
{
- /* Be sure the write-protect PAT entry is set for write-protect */
- if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP)
+ if (!x86_has_pat_wp())
return NULL;
-
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
#endif /* CONFIG_AMD_MEM_ENCRYPT */
@@ -889,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx,
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
- __flush_tlb_one_kernel(addr);
+ flush_tlb_one_kernel(addr);
}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 9994353fb75d..be020a7bc414 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v;
- new_pmd = pmd_mknotpresent(*pmd);
+ new_pmd = pmd_mkinvalid(*pmd);
} else {
/* Presume this has been called with clear==true previously */
new_pmd = __pmd(*old);
@@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
return -1;
}
- __flush_tlb_one_kernel(f->addr);
+ flush_tlb_one_kernel(f->addr);
return 0;
}
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index a03614bd3e1a..4a781cf99e92 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size,
size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE;
} while (size);
- __native_flush_tlb();
+ flush_tlb_local();
}
void __init sme_unmap_bootdata(char *real_mode_data)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index cb91eccc4960..c90c20904a60 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -18,7 +18,9 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/compat.h>
+#include <linux/elf-randomize.h>
#include <asm/elf.h>
+#include <asm/io.h>
#include "physaddr.h"
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 109325d77b3e..43fd19b3f118 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -372,7 +372,7 @@ static void enter_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL &&
+ if (!cpumask_available(downed_cpus) &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
pr_notice("Failed to allocate mask\n");
goto out;
@@ -402,7 +402,7 @@ static void leave_uniprocessor(void)
int cpu;
int err;
- if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
+ if (!cpumask_available(downed_cpus) || cpumask_weight(downed_cpus) == 0)
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 59ba008504dc..8ee952038c80 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void)
* reserve specific pages for Sandy Bridge graphics. ]
*/
for_each_memblock(reserved, mb_region) {
- if (mb_region->nid != MAX_NUMNODES)
- node_set(mb_region->nid, reserved_nodemask);
+ int nid = memblock_get_region_node(mb_region);
+
+ if (nid != MAX_NUMNODES)
+ node_set(nid, reserved_nodemask);
}
/*
@@ -735,12 +737,9 @@ void __init x86_numa_init(void)
static void __init init_memory_less_node(int nid)
{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
/* Allocate and initialize node data. Memory-less node is now online.*/
alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
+ free_area_init_memoryless_node(nid);
/*
* All zonelists will be built later in start_kernel() after per cpu
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index f2bd3d61e16b..104544359d69 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -27,40 +27,6 @@
#include "numa_internal.h"
-#ifdef CONFIG_DISCONTIGMEM
-/*
- * 4) physnode_map - the mapping between a pfn and owning node
- * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 64Mb break (each element of the array will
- * represent 64Mb of memory and will be marked by the node id. so,
- * if the first gig is on node 0, and the second gig is on node 1
- * physnode_map will contain:
- *
- * physnode_map[0-15] = 0;
- * physnode_map[16-31] = 1;
- * physnode_map[32- ] = -1;
- */
-s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
-EXPORT_SYMBOL(physnode_map);
-
-void memory_present(int nid, unsigned long start, unsigned long end)
-{
- unsigned long pfn;
-
- printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
- nid, start, end);
- printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
- printk(KERN_DEBUG " ");
- start = round_down(start, PAGES_PER_SECTION);
- end = round_up(end, PAGES_PER_SECTION);
- for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
- physnode_map[pfn / PAGES_PER_SECTION] = nid;
- printk(KERN_CONT "%lx ", pfn);
- }
- printk(KERN_CONT "\n");
-}
-#endif
-
extern unsigned long highend_pfn, highstart_pfn;
void __init initmem_init(void)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index b8c55a2e402d..77e04304a2a7 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -69,6 +69,11 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_PAGES_ARRAY 4
#define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */
+static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
+{
+ return __pgprot(cachemode2protval(pcm));
+}
+
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -341,7 +346,7 @@ static void __cpa_flush_tlb(void *data)
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
- __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
+ flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 7bd2c3a52297..d88e9064c28e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -19,6 +19,14 @@ EXPORT_SYMBOL(physical_mask);
#define PGTABLE_HIGHMEM 0
#endif
+#ifndef CONFIG_PARAVIRT
+static inline
+void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ tlb_remove_page(tlb, table);
+}
+#endif
+
gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;
pgtable_t pte_alloc_one(struct mm_struct *mm)
@@ -706,11 +714,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
if (pud_present(*pud) && !pud_huge(*pud))
return 0;
- prot = pgprot_4k_2_large(prot);
-
set_pte((pte_t *)pud, pfn_pte(
(u64)addr >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
@@ -738,11 +744,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
if (pmd_present(*pmd) && !pmd_huge(*pmd))
return 0;
- prot = pgprot_4k_2_large(prot);
-
set_pte((pte_t *)pmd, pfn_pte(
(u64)addr >> PAGE_SHIFT,
- __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+ __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
return 1;
}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 0e6700eaa4f9..e1ce59dc558f 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -64,7 +64,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
- __flush_tlb_one_kernel(vaddr);
+ flush_tlb_one_kernel(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 843aa10a4cb6..da0fb17a1a36 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void)
* the sp1 and sp2 slots.
*
* This is done for all possible CPUs during boot to ensure
- * that it's propagated to all mms. If we were to add one of
- * these mappings during CPU hotplug, we would need to take
- * some measure to make sure that every mm that subsequently
- * ran on that CPU would have the relevant PGD entry in its
- * pagetables. The usual vmalloc_fault() mechanism would not
- * work for page faults taken in entry_SYSCALL_64 before RSP
- * is set up.
+ * that it's propagated to all mms.
*/
unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 66f96f21a7b6..1a3569b43aa5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -18,6 +18,16 @@
#include "mm_internal.h"
+#ifdef CONFIG_PARAVIRT
+# define STATIC_NOPV
+#else
+# define STATIC_NOPV static
+# define __flush_tlb_local native_flush_tlb_local
+# define __flush_tlb_global native_flush_tlb_global
+# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
+# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+#endif
+
/*
* TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
@@ -39,6 +49,126 @@
#define LAST_USER_MM_IBPB 0x1UL
/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID - [0, TLB_NR_DYN_ASIDS-1]
+ * the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ * the value we write into the PCID part of CR3; corresponds to the
+ * ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ * for KPTI each mm has two address spaces and thus needs two
+ * PCID values, but we can still do with a single ASID denomination
+ * for each mm. Corresponds to kPCID + 2048.
+ *
+ */
+
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS 12
+
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS 1
+#else
+# define PTI_CONSUMED_PCID_BITS 0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
+
+/*
+ * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
+ * for them being zero-based. Another -1 is because PCID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
+
+/*
+ * Given @asid, compute kPCID
+ */
+static inline u16 kern_pcid(u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Make sure that the dynamic ASID space does not confict with the
+ * bit we are using to switch between user and kernel ASIDs.
+ */
+ BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
+
+ /*
+ * The ASID being passed in here should have respected the
+ * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+ */
+ VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
+#endif
+ /*
+ * The dynamically-assigned ASIDs that get passed in are small
+ * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
+ * so do not bother to clear it.
+ *
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the
+ * PCID bits. This serves two purposes. It prevents a nasty
+ * situation in which PCID-unaware code saves CR3, loads some other
+ * value (with PCID == 0), and then restores CR3, thus corrupting
+ * the TLB for ASID 0 if the saved ASID was nonzero. It also means
+ * that any bugs involving loading a PCID-enabled CR3 with
+ * CR4.PCIDE off will trigger deterministically.
+ */
+ return asid + 1;
+}
+
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+ u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
+#endif
+ return ret;
+}
+
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ return __sme_pa(pgd) | kern_pcid(asid);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ /*
+ * Use boot_cpu_has() instead of this_cpu_has() as this function
+ * might be called during early boot. This should work even after
+ * boot because all CPU's the have same capabilities:
+ */
+ VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
+ return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+}
+
+/*
* We get here when we do something requiring a TLB invalidation
* but could not go invalidate all of the contexts. We do the
* necessary invalidation by clearing out the 'ctx_id' which
@@ -110,6 +240,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
*need_flush = true;
}
+/*
+ * Given an ASID, flush the corresponding user ASID. We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+ /* There is no user ASID if address space separation is off */
+ if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ return;
+
+ /*
+ * We only have a single ASID if PCID is off and the CR3
+ * write will have flushed it.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_PCID))
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ __set_bit(kern_pcid(asid),
+ (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
{
unsigned long new_mm_cr3;
@@ -161,34 +317,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
-static void sync_current_stack_to_mm(struct mm_struct *mm)
-{
- unsigned long sp = current_stack_pointer;
- pgd_t *pgd = pgd_offset(mm, sp);
-
- if (pgtable_l5_enabled()) {
- if (unlikely(pgd_none(*pgd))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
-
- set_pgd(pgd, *pgd_ref);
- }
- } else {
- /*
- * "pgd" is faked. The top level entries are "p4d"s, so sync
- * the p4d. This compiles to approximately the same code as
- * the 5-level case.
- */
- p4d_t *p4d = p4d_offset(pgd, sp);
-
- if (unlikely(p4d_none(*p4d))) {
- pgd_t *pgd_ref = pgd_offset_k(sp);
- p4d_t *p4d_ref = p4d_offset(pgd_ref, sp);
-
- set_p4d(p4d, *p4d_ref);
- }
- }
-}
-
static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
@@ -272,6 +400,26 @@ static void cond_ibpb(struct task_struct *next)
}
}
+#ifdef CONFIG_PERF_EVENTS
+static inline void cr4_update_pce_mm(struct mm_struct *mm)
+{
+ if (static_branch_unlikely(&rdpmc_always_available_key) ||
+ (!static_branch_unlikely(&rdpmc_never_available_key) &&
+ atomic_read(&mm->context.perf_rdpmc_allowed)))
+ cr4_set_bits_irqsoff(X86_CR4_PCE);
+ else
+ cr4_clear_bits_irqsoff(X86_CR4_PCE);
+}
+
+void cr4_update_pce(void *ignored)
+{
+ cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm));
+}
+
+#else
+static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
+#endif
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -377,15 +525,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
cond_ibpb(tsk);
- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- * If our current stack is in vmalloc space and isn't
- * mapped in the new pgd, we'll double-fault. Forcibly
- * map it.
- */
- sync_current_stack_to_mm(next);
- }
-
/*
* Stop remote flushes for the previous mm.
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
@@ -440,7 +579,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
if (next != real_prev) {
- load_mm_cr4_irqsoff(next);
+ cr4_update_pce_mm(next);
switch_ldt(real_prev, next);
}
}
@@ -617,7 +756,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
unsigned long addr = f->start;
while (addr < f->end) {
- __flush_tlb_one_user(addr);
+ flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
@@ -625,7 +764,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
- local_flush_tlb();
+ flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
trace_tlb_flush(reason, TLB_FLUSH_ALL);
@@ -660,8 +799,8 @@ static bool tlb_is_not_lazy(int cpu, void *data)
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
}
-void native_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
+STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
@@ -711,6 +850,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1, cpumask);
}
+void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ __flush_tlb_others(cpumask, info);
+}
+
/*
* See Documentation/x86/tlb.rst for details. We choose 33
* because it is large enough to cover the vast majority (at
@@ -821,7 +966,7 @@ static void do_kernel_range_flush(void *info)
/* flush range by one by one 'invlpg' */
for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_one_kernel(addr);
+ flush_tlb_one_kernel(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -844,6 +989,164 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
/*
+ * This can be used from process context to figure out what the value of
+ * CR3 is without needing to do a (slow) __read_cr3().
+ *
+ * It's intended to be used for code like KVM that sneakily changes CR3
+ * and needs to restore it. It needs to be used very carefully.
+ */
+unsigned long __get_current_cr3_fast(void)
+{
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* For now, be very restrictive about when this can be called. */
+ VM_WARN_ON(in_nmi() || preemptible());
+
+ VM_BUG_ON(cr3 != __read_cr3());
+ return cr3;
+}
+EXPORT_SYMBOL_GPL(__get_current_cr3_fast);
+
+/*
+ * Flush one page in the kernel mapping
+ */
+void flush_tlb_one_kernel(unsigned long addr)
+{
+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
+
+ /*
+ * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its
+ * paravirt equivalent. Even with PCID, this is sufficient: we only
+ * use PCID if we also use global PTEs for the kernel mapping, and
+ * INVLPG flushes global translations across all address spaces.
+ *
+ * If PTI is on, then the kernel is mapped with non-global PTEs, and
+ * __flush_tlb_one_user() will flush the given address for the current
+ * kernel address space and for its usermode counterpart, but it does
+ * not flush it for other address spaces.
+ */
+ flush_tlb_one_user(addr);
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * See above. We need to propagate the flush to all other address
+ * spaces. In principle, we only need to propagate it to kernelmode
+ * address spaces, but the extra bookkeeping we would need is not
+ * worth it.
+ */
+ this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
+/*
+ * Flush one page in the user mapping
+ */
+STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
+{
+ u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+ if (!static_cpu_has(X86_FEATURE_PTI))
+ return;
+
+ /*
+ * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+ * Just use invalidate_user_asid() in case we are called early.
+ */
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+ invalidate_user_asid(loaded_mm_asid);
+ else
+ invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+}
+
+void flush_tlb_one_user(unsigned long addr)
+{
+ __flush_tlb_one_user(addr);
+}
+
+/*
+ * Flush everything
+ */
+STATIC_NOPV void native_flush_tlb_global(void)
+{
+ unsigned long cr4, flags;
+
+ if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ /*
+ * Using INVPCID is considerably faster than a pair of writes
+ * to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
+ */
+ invpcid_flush_all();
+ return;
+ }
+
+ /*
+ * Read-modify-write to CR4 - protect it from preemption and
+ * from interrupts. (Use the raw variant because this code can
+ * be called from deep inside debugging code.)
+ */
+ raw_local_irq_save(flags);
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ /* toggle PGE */
+ native_write_cr4(cr4 ^ X86_CR4_PGE);
+ /* write old PGE again and flush TLBs */
+ native_write_cr4(cr4);
+
+ raw_local_irq_restore(flags);
+}
+
+/*
+ * Flush the entire current user mapping
+ */
+STATIC_NOPV void native_flush_tlb_local(void)
+{
+ /*
+ * Preemption or interrupts must be disabled to protect the access
+ * to the per CPU variable and to prevent being preempted between
+ * read_cr3() and write_cr3().
+ */
+ WARN_ON_ONCE(preemptible());
+
+ invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+
+ /* If current->mm == NULL then the read_cr3() "borrows" an mm */
+ native_write_cr3(__native_read_cr3());
+}
+
+void flush_tlb_local(void)
+{
+ __flush_tlb_local();
+}
+
+/*
+ * Flush everything
+ */
+void __flush_tlb_all(void)
+{
+ /*
+ * This is to catch users with enabled preemption and the PGE feature
+ * and don't trigger the warning in __native_flush_tlb().
+ */
+ VM_WARN_ON_ONCE(preemptible());
+
+ if (boot_cpu_has(X86_FEATURE_PGE)) {
+ __flush_tlb_global();
+ } else {
+ /*
+ * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+ */
+ flush_tlb_local();
+ }
+}
+EXPORT_SYMBOL_GPL(__flush_tlb_all);
+
+/*
* arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
* This means that the 'struct flush_tlb_info' that describes which mappings to
* flush is actually fixed. We therefore set a single fixed struct and use it in
@@ -874,6 +1177,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
put_cpu();
}
+/*
+ * Blindly accessing user memory from NMI context can be dangerous
+ * if we're in the middle of switching the current user task or
+ * switching the loaded mm. It can also be dangerous if we
+ * interrupted some kernel code that was temporarily using a
+ * different mm.
+ */
+bool nmi_uaccess_okay(void)
+{
+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *current_mm = current->mm;
+
+ VM_WARN_ON_ONCE(!loaded_mm);
+
+ /*
+ * The condition we want to check is
+ * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though,
+ * if we're running in a VM with shadow paging, and nmi_uaccess_okay()
+ * is supposed to be reasonably fast.
+ *
+ * Instead, we check the almost equivalent but somewhat conservative
+ * condition below, and we rely on the fact that switch_mm_irqs_off()
+ * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3.
+ */
+ if (loaded_mm != current_mm)
+ return false;
+
+ VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa()));
+
+ return true;
+}
+
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{