aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/mm/tlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--arch/x86/mm/tlb.c127
1 files changed, 77 insertions, 50 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d7..44ac64f3a047 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@
#include <linux/debugfs.h>
#include <linux/sched/smt.h>
#include <linux/task_work.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -88,10 +89,10 @@
#define CR3_HW_ASID_BITS 12
/*
- * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * When enabled, MITIGATION_PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
@@ -113,7 +114,7 @@ static inline u16 kern_pcid(u16 asid)
{
VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
@@ -148,32 +149,36 @@ static inline u16 kern_pcid(u16 asid)
static inline u16 user_pcid(u16 asid)
{
u16 ret = kern_pcid(asid);
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
#endif
return ret;
}
-static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid, unsigned long lam)
{
+ unsigned long cr3 = __sme_pa(pgd) | lam;
+
if (static_cpu_has(X86_FEATURE_PCID)) {
- return __sme_pa(pgd) | kern_pcid(asid);
+ VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+ cr3 |= kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
- return __sme_pa(pgd);
}
+
+ return cr3;
}
-static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid,
+ unsigned long lam)
{
- VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
/*
* Use boot_cpu_has() instead of this_cpu_has() as this function
* might be called during early boot. This should work even after
* boot because all CPU's the have same capabilities:
*/
VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID));
- return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
+ return build_cr3(pgd, asid, lam) | CR3_NOFLUSH;
}
/*
@@ -257,7 +262,7 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
- if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+ if (!IS_ENABLED(CONFIG_MITIGATION_PAGE_TABLE_ISOLATION))
return;
/*
@@ -274,15 +279,16 @@ static inline void invalidate_user_asid(u16 asid)
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
-static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, unsigned long lam,
+ bool need_flush)
{
unsigned long new_mm_cr3;
if (need_flush) {
invalidate_user_asid(new_asid);
- new_mm_cr3 = build_cr3(pgdir, new_asid);
+ new_mm_cr3 = build_cr3(pgdir, new_asid, lam);
} else {
- new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+ new_mm_cr3 = build_cr3_noflush(pgdir, new_asid, lam);
}
/*
@@ -293,7 +299,7 @@ static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
write_cr3(new_mm_cr3);
}
-void leave_mm(int cpu)
+void leave_mm(void)
{
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -321,7 +327,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
unsigned long flags;
local_irq_save(flags);
- switch_mm_irqs_off(prev, next, tsk);
+ switch_mm_irqs_off(NULL, next, tsk);
local_irq_restore(flags);
}
@@ -349,7 +355,7 @@ static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm,
/*
* Validate that it is not running on an SMT sibling as this would
- * make the excercise pointless because the siblings share L1D. If
+ * make the exercise pointless because the siblings share L1D. If
* it runs on a SMT sibling, notify it with SIGBUS on return to
* user/guest
*/
@@ -486,26 +492,24 @@ void cr4_update_pce(void *ignored)
static inline void cr4_update_pce_mm(struct mm_struct *mm) { }
#endif
-void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+/*
+ * This optimizes when not actually switching mm's. Some architectures use the
+ * 'unused' argument for this optimization, but x86 must use
+ * 'cpu_tlbstate.loaded_mm' instead because it does not always keep
+ * 'current->active_mm' up to date.
+ */
+void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
struct task_struct *tsk)
{
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+ struct mm_struct *prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long new_lam = mm_lam_cr3_mask(next);
bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
u16 new_asid;
- /*
- * NB: The scheduler will call us with prev == next when switching
- * from lazy TLB mode to normal mode if active_mm isn't changing.
- * When this happens, we don't assume that CR3 (and hence
- * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
/* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -520,7 +524,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid,
+ tlbstate_lam_cr3_mask()))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -551,16 +556,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* provides that full memory barrier and core serializing
* instruction.
*/
- if (real_prev == next) {
+ if (prev == next) {
+ /* Not actually switching mm's */
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
next->context.ctx_id);
/*
+ * If this races with another thread that enables lam, 'new_lam'
+ * might not match tlbstate_lam_cr3_mask().
+ */
+
+ /*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
* cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
- if (WARN_ON_ONCE(real_prev != &init_mm &&
+ if (WARN_ON_ONCE(prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
@@ -602,10 +613,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
* but the bitmap manipulation can cause cache line contention.
*/
- if (real_prev != &init_mm) {
+ if (prev != &init_mm) {
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
- mm_cpumask(real_prev)));
- cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
+ mm_cpumask(prev)));
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
}
/*
@@ -622,15 +633,16 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
barrier();
}
+ set_tlbstate_lam_mode(next);
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, true);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ load_new_mm_cr3(next->pgd, new_asid, new_lam, false);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -641,9 +653,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
this_cpu_write(cpu_tlbstate.loaded_mm, next);
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
- if (next != real_prev) {
+ if (next != prev) {
cr4_update_pce_mm(next);
- switch_ldt(real_prev, next);
+ switch_ldt(prev, next);
}
}
@@ -691,6 +703,10 @@ void initialize_tlbstate_and_flush(void)
/* Assert that CR3 already references the right mm. */
WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
+ /* LAM expected to be disabled */
+ WARN_ON(cr3 & (X86_CR3_LAM_U48 | X86_CR3_LAM_U57));
+ WARN_ON(mm_lam_cr3_mask(mm));
+
/*
* Assert that CR4.PCIDE is set if needed. (CR4.PCIDE initialization
* doesn't work like other CR4 bits because it can only be set from
@@ -699,8 +715,8 @@ void initialize_tlbstate_and_flush(void)
WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
!(cr4_read_shadow() & X86_CR4_PCIDE));
- /* Force ASID 0 and force a TLB flush. */
- write_cr3(build_cr3(mm->pgd, 0));
+ /* Disable LAM, force ASID 0 and force a TLB flush. */
+ write_cr3(build_cr3(mm->pgd, 0, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT);
@@ -708,6 +724,7 @@ void initialize_tlbstate_and_flush(void)
this_cpu_write(cpu_tlbstate.next_asid, 1);
this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
+ set_tlbstate_lam_mode(mm);
for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
@@ -925,7 +942,7 @@ void flush_tlb_multi(const struct cpumask *cpumask,
}
/*
- * See Documentation/x86/tlb.rst for details. We choose 33
+ * See Documentation/arch/x86/tlb.rst for details. We choose 33
* because it is large enough to cover the vast majority (at
* least 95%) of allocations, and is small enough that we are
* confident it will not cause too much overhead. Each single
@@ -1017,6 +1034,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_flush_tlb_info();
put_cpu();
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
@@ -1071,8 +1089,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
*/
unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
- this_cpu_read(cpu_tlbstate.loaded_mm_asid));
+ unsigned long cr3 =
+ build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid),
+ tlbstate_lam_cr3_mask());
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
@@ -1119,21 +1139,28 @@ void flush_tlb_one_kernel(unsigned long addr)
*/
STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
{
- u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ u32 loaded_mm_asid;
+ bool cpu_pcide;
+ /* Flush 'addr' from the kernel PCID: */
asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /* If PTI is off there is no user PCID and nothing to flush. */
if (!static_cpu_has(X86_FEATURE_PTI))
return;
+ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ cpu_pcide = this_cpu_read(cpu_tlbstate.cr4) & X86_CR4_PCIDE;
+
/*
- * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
- * Just use invalidate_user_asid() in case we are called early.
+ * invpcid_flush_one(pcid>0) will #GP if CR4.PCIDE==0. Check
+ * 'cpu_pcide' to ensure that *this* CPU will not trigger those
+ * #GP's even if called before CR4.PCIDE has been initialized.
*/
- if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
- invalidate_user_asid(loaded_mm_asid);
- else
+ if (boot_cpu_has(X86_FEATURE_INVPCID) && cpu_pcide)
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+ else
+ invalidate_user_asid(loaded_mm_asid);
}
void flush_tlb_one_user(unsigned long addr)
@@ -1205,7 +1232,7 @@ void __flush_tlb_all(void)
*/
VM_WARN_ON_ONCE(preemptible());
- if (boot_cpu_has(X86_FEATURE_PGE)) {
+ if (cpu_feature_enabled(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*