Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r-- | arch/x86/mm/tlb.c | 726 |
1 file changed, 564 insertions(+), 162 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..c1e31e9a85d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -8,16 +8,29 @@ #include <linux/export.h> #include <linux/cpu.h> #include <linux/debugfs.h> +#include <linux/sched/smt.h> +#include <linux/task_work.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/nospec-branch.h> #include <asm/cache.h> +#include <asm/cacheflush.h> #include <asm/apic.h> -#include <asm/uv/uv.h> +#include <asm/perf_event.h> #include "mm_internal.h" +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info) +#endif + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -33,10 +46,135 @@ */ /* - * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is - * stored in cpu_tlb_state.last_user_mm_ibpb. + * Bits to mangle the TIF_SPEC_* state into the mm pointer which is + * stored in cpu_tlb_state.last_user_mm_spec. */ #define LAST_USER_MM_IBPB 0x1UL +#define LAST_USER_MM_L1D_FLUSH 0x2UL +#define LAST_USER_MM_SPEC_MASK (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH) + +/* Bits to set when tlbstate and flush is (re)initialized */ +#define LAST_USER_MM_INIT LAST_USER_MM_IBPB + +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ + +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not conflict with the + * bit we are using to switch between user and kernel ASIDs. 
+ */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * (<TLB_NR_DYN_ASIDS). They never have the high switch bit set, + * so do not bother to clear it. + * + * If PCID is on, ASID-aware code paths put the ASID+1 into the + * PCID bits. This serves two purposes. It prevents a nasty + * situation in which PCID-unaware code saves CR3, loads some other + * value (with PCID == 0), and then restores CR3, thus corrupting + * the TLB for ASID 0 if the saved ASID was nonzero. It also means + * that any bugs involving loading a PCID-enabled CR3 with + * CR4.PCIDE off will trigger deterministically. + */ + return asid + 1; +} + +/* + * Given @asid, compute uPCID + */ +static inline u16 user_pcid(u16 asid) +{ + u16 ret = kern_pcid(asid); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + ret |= 1 << X86_CR3_PTI_PCID_USER_BIT; +#endif + return ret; +} + +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid) +{ + if (static_cpu_has(X86_FEATURE_PCID)) { + return __sme_pa(pgd) | kern_pcid(asid); + } else { + VM_WARN_ON_ONCE(asid != 0); + return __sme_pa(pgd); + } +} + +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPU's the have same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; +} /* * We get here when we do something requiring a TLB invalidation @@ -110,6 +248,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. + */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { unsigned long new_mm_cr3; @@ -145,7 +309,7 @@ void leave_mm(int cpu) return; /* Warn if we're not lazy. */ - WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy)); + WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy)); switch_mm(NULL, &init_mm, NULL); } @@ -161,48 +325,70 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) +/* + * Invoked from return to user/guest by a task that opted-in to L1D + * flushing but ended up running on an SMT enabled core due to wrong + * affinity settings or CPU hotplug. This is part of the paranoid L1D flush + * contract which this task requested. 
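The ASID -> kPCID/uPCID mapping above is plain bit arithmetic. The stand-alone C sketch below reproduces it outside the kernel; the constant values (TLB_NR_DYN_ASIDS = 6, X86_CR3_PTI_PCID_USER_BIT = 11, CR3_NOFLUSH = bit 63) are assumptions chosen to match the comments in this hunk, __sme_pa() is replaced by a raw physical address, and the VM_WARN_ON checks are reduced to a single assert.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TLB_NR_DYN_ASIDS          6     /* assumed size of the per-CPU ASID cache */
#define X86_CR3_PTI_PCID_USER_BIT 11    /* bit selecting the user half under PTI */
#define CR3_NOFLUSH               (1ULL << 63)

/* kPCID: ASID + 1, because hardware PCID 0 is reserved for non-PCID use. */
static uint16_t kern_pcid(uint16_t asid)
{
        assert(asid < TLB_NR_DYN_ASIDS);
        return asid + 1;
}

/* uPCID: the same value with the PTI user bit set, i.e. kPCID + 2048. */
static uint16_t user_pcid(uint16_t asid)
{
        return kern_pcid(asid) | (1u << X86_CR3_PTI_PCID_USER_BIT);
}

/* CR3 image: page-table base OR'd with the kPCID, optionally NOFLUSH. */
static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid, int noflush)
{
        return pgd_pa | kern_pcid(asid) | (noflush ? CR3_NOFLUSH : 0);
}

int main(void)
{
        uint16_t asid = 3;

        printf("kPCID=%u uPCID=%u cr3=%#llx\n",
               (unsigned)kern_pcid(asid), (unsigned)user_pcid(asid),
               (unsigned long long)build_cr3(0x1000, asid, 1));
        return 0;
}

For ASID 3 this prints kPCID=4 and uPCID=2052, i.e. kPCID + 2048, matching the naming comment above.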
+ */ +static void l1d_flush_force_sigbus(struct callback_head *ch) { - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); + force_sig(SIGBUS); +} - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. - */ - p4d_t *p4d = p4d_offset(pgd, sp); +static void l1d_flush_evaluate(unsigned long prev_mm, unsigned long next_mm, + struct task_struct *next) +{ + /* Flush L1D if the outgoing task requests it */ + if (prev_mm & LAST_USER_MM_L1D_FLUSH) + wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); + /* Check whether the incoming task opted in for L1D flush */ + if (likely(!(next_mm & LAST_USER_MM_L1D_FLUSH))) + return; - set_p4d(p4d, *p4d_ref); - } + /* + * Validate that it is not running on an SMT sibling as this would + * make the excercise pointless because the siblings share L1D. If + * it runs on a SMT sibling, notify it with SIGBUS on return to + * user/guest + */ + if (this_cpu_read(cpu_info.smt_active)) { + clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); + next->l1d_flush_kill.func = l1d_flush_force_sigbus; + task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); } } -static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) +static unsigned long mm_mangle_tif_spec_bits(struct task_struct *next) { - unsigned long next_tif = task_thread_info(next)->flags; - unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB; + unsigned long next_tif = read_task_thread_flags(next); + unsigned long spec_bits = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK; - return (unsigned long)next->mm | ibpb; + /* + * Ensure that the bit shift above works as expected and the two flags + * end up in bit 0 and 1. + */ + BUILD_BUG_ON(TIF_SPEC_L1D_FLUSH != TIF_SPEC_IB + 1); + + return (unsigned long)next->mm | spec_bits; } -static void cond_ibpb(struct task_struct *next) +static void cond_mitigation(struct task_struct *next) { + unsigned long prev_mm, next_mm; + if (!next || !next->mm) return; + next_mm = mm_mangle_tif_spec_bits(next); + prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_spec); + /* + * Avoid user/user BTB poisoning by flushing the branch predictor + * when switching between processes. This stops one process from + * doing Spectre-v2 attacks on another. + * * Both, the conditional and the always IBPB mode use the mm * pointer to avoid the IBPB when switching between tasks of the * same process. Using the mm pointer instead of mm->context.ctx_id @@ -212,8 +398,6 @@ static void cond_ibpb(struct task_struct *next) * exposed data is not really interesting. */ if (static_branch_likely(&switch_mm_cond_ibpb)) { - unsigned long prev_mm, next_mm; - /* * This is a bit more complex than the always mode because * it has to handle two cases: @@ -243,20 +427,14 @@ static void cond_ibpb(struct task_struct *next) * Optimize this with reasonably small overhead for the * above cases. Mangle the TIF_SPEC_IB bit into the mm * pointer of the incoming task which is stored in - * cpu_tlbstate.last_user_mm_ibpb for comparison. - */ - next_mm = mm_mangle_tif_spec_ib(next); - prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb); - - /* + * cpu_tlbstate.last_user_mm_spec for comparison. 
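mm_mangle_tif_spec_bits() works because an mm_struct pointer is always at least word aligned, so its two low bits are free to carry the TIF_SPEC_IB and TIF_SPEC_L1D_FLUSH state. A minimal userspace sketch of that packing follows; the TIF_* bit positions are illustrative assumptions, the only property relied on here (as in the BUILD_BUG_ON above) is TIF_SPEC_L1D_FLUSH == TIF_SPEC_IB + 1.

#include <stdint.h>
#include <stdio.h>

/* Illustrative thread-flag positions; the real TIF_* values may differ. */
#define TIF_SPEC_IB             9
#define TIF_SPEC_L1D_FLUSH      10

#define LAST_USER_MM_IBPB       0x1UL
#define LAST_USER_MM_L1D_FLUSH  0x2UL
#define LAST_USER_MM_SPEC_MASK  (LAST_USER_MM_IBPB | LAST_USER_MM_L1D_FLUSH)

/* Fold the two TIF_SPEC_* flags into the low bits of the aligned mm pointer. */
static uintptr_t mangle(const void *mm, unsigned long tif_flags)
{
        unsigned long spec = (tif_flags >> TIF_SPEC_IB) & LAST_USER_MM_SPEC_MASK;

        return (uintptr_t)mm | spec;
}

int main(void)
{
        static int dummy_mm;    /* stand-in for a (well-aligned) struct mm_struct */
        unsigned long tif = (1UL << TIF_SPEC_IB) | (1UL << TIF_SPEC_L1D_FLUSH);
        uintptr_t packed = mangle(&dummy_mm, tif);

        printf("mm=%p packed=%#lx spec=%#lx\n",
               (void *)&dummy_mm, (unsigned long)packed,
               (unsigned long)(packed & LAST_USER_MM_SPEC_MASK));
        return 0;
}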
+ * * Issue IBPB only if the mm's are different and one or * both have the IBPB bit set. */ if (next_mm != prev_mm && (next_mm | prev_mm) & LAST_USER_MM_IBPB) indirect_branch_prediction_barrier(); - - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm); } if (static_branch_unlikely(&switch_mm_always_ibpb)) { @@ -265,19 +443,55 @@ static void cond_ibpb(struct task_struct *next) * different context than the user space task which ran * last on this CPU. */ - if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) { + if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != + (unsigned long)next->mm) indirect_branch_prediction_barrier(); - this_cpu_write(cpu_tlbstate.last_user_mm, next->mm); - } } + + if (static_branch_unlikely(&switch_mm_cond_l1d_flush)) { + /* + * Flush L1D when the outgoing task requested it and/or + * check whether the incoming task requested L1D flushing + * and ended up on an SMT sibling. + */ + if (unlikely((prev_mm | next_mm) & LAST_USER_MM_L1D_FLUSH)) + l1d_flush_evaluate(prev_mm, next_mm, next); + } + + this_cpu_write(cpu_tlbstate.last_user_mm_spec, next_mm); +} + +#ifdef CONFIG_PERF_EVENTS +static inline void cr4_update_pce_mm(struct mm_struct *mm) +{ + if (static_branch_unlikely(&rdpmc_always_available_key) || + (!static_branch_unlikely(&rdpmc_never_available_key) && + atomic_read(&mm->context.perf_rdpmc_allowed))) { + /* + * Clear the existing dirty counters to + * prevent the leak for an RDPMC task. + */ + perf_clear_dirty_counters(); + cr4_set_bits_irqsoff(X86_CR4_PCE); + } else + cr4_clear_bits_irqsoff(X86_CR4_PCE); +} + +void cr4_update_pce(void *ignored) +{ + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); } +#else +static inline void cr4_update_pce_mm(struct mm_struct *mm) { } +#endif + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy); + bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy); unsigned cpu = smp_processor_id(); u64 next_tlb_gen; bool need_flush; @@ -292,7 +506,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * NB: leave_mm() calls us with prev == NULL and tsk == NULL. */ - /* We don't want flush_tlb_func_* to run concurrently with us. */ + /* We don't want flush_tlb_func() to run concurrently with us. */ if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(!irqs_disabled()); @@ -322,13 +536,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, __flush_tlb_all(); } #endif - this_cpu_write(cpu_tlbstate.is_lazy, false); + if (was_lazy) + this_cpu_write(cpu_tlbstate_shared.is_lazy, false); /* * The membarrier system call requires a full memory barrier and * core serialization before returning to user-space, after - * storing to rq->curr. Writing to CR3 provides that full - * memory barrier and core serializing instruction. + * storing to rq->curr, when changing mm. This is because + * membarrier() sends IPIs to all CPUs that are in the target mm + * to make them issue memory barriers. However, if another CPU + * switches to/from the target mm concurrently with + * membarrier(), it can cause that CPU not to receive an IPI + * when it really should issue a memory barrier. Writing to CR3 + * provides that full memory barrier and core serializing + * instruction. 
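Stripped of the static branches and of the barrier itself, the two IBPB policies in cond_mitigation() reduce to simple predicates over the mangled mm values. A hedged sketch; cond_ibpb_needed() and always_ibpb_needed() are made-up helper names used only for illustration.

#include <stdbool.h>
#include <stdio.h>

#define LAST_USER_MM_IBPB       0x1UL
#define LAST_USER_MM_SPEC_MASK  0x3UL

/* Conditional mode: barrier only when switching between different mm's and
 * at least one side has the IBPB bit set in its mangled mm value. */
static bool cond_ibpb_needed(unsigned long prev_mm, unsigned long next_mm)
{
        return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
}

/* Always mode: barrier whenever the bare mm pointer changes. */
static bool always_ibpb_needed(unsigned long prev_mm, unsigned long next_mm_ptr)
{
        return (prev_mm & ~LAST_USER_MM_SPEC_MASK) != next_mm_ptr;
}

int main(void)
{
        /* Same mm with the IBPB bit set: no barrier in conditional mode. */
        printf("%d\n", cond_ibpb_needed(0x1001, 0x1001));
        /* Different mm's, one side opted in: barrier. */
        printf("%d\n", cond_ibpb_needed(0x1001, 0x2000));
        printf("%d\n", always_ibpb_needed(0x1001, 0x2000));
        return 0;
}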
*/ if (real_prev == next) { VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != @@ -337,7 +558,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* * Even in lazy TLB mode, the CPU should stay set in the * mm_cpumask. The TLB shootdown code can figure out from - * from cpu_tlbstate.is_lazy whether or not to send an IPI. + * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. */ if (WARN_ON_ONCE(real_prev != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next)))) @@ -371,20 +592,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, need_flush = true; } else { /* - * Avoid user/user BTB poisoning by flushing the branch - * predictor when switching between processes. This stops - * one process from doing Spectre-v2 attacks on another. + * Apply process to process speculation vulnerability + * mitigations if applicable. */ - cond_ibpb(tsk); - - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } + cond_mitigation(tsk); /* * Stop remote flushes for the previous mm. @@ -416,21 +627,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); load_new_mm_cr3(next->pgd, new_asid, true); - /* - * NB: This gets called via leave_mm() in the idle path - * where RCU functions differently. Tracing normally - * uses RCU, so we need to use the _rcuidle variant. - * - * (There is no good reason for this. The idle code should - * be rearranged to call this before rcu_idle_enter().) - */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } else { /* The new ASID is already up to date. */ load_new_mm_cr3(next->pgd, new_asid, false); - /* See above wrt _rcuidle. */ - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); } /* Make sure we write CR3 before loaded_mm. */ @@ -440,7 +642,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4_irqsoff(next); + cr4_update_pce_mm(next); switch_ldt(real_prev, next); } } @@ -463,7 +665,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) return; - this_cpu_write(cpu_tlbstate.is_lazy, true); + this_cpu_write(cpu_tlbstate_shared.is_lazy, true); } /* @@ -501,7 +703,7 @@ void initialize_tlbstate_and_flush(void) write_cr3(build_cr3(mm->pgd, 0)); /* Reinitialize tlbstate. */ - this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, LAST_USER_MM_IBPB); + this_cpu_write(cpu_tlbstate.last_user_mm_spec, LAST_USER_MM_INIT); this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0); this_cpu_write(cpu_tlbstate.next_asid, 1); this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); @@ -512,14 +714,13 @@ void initialize_tlbstate_and_flush(void) } /* - * flush_tlb_func_common()'s memory ordering requirement is that any + * flush_tlb_func()'s memory ordering requirement is that any * TLB fills that happen after we flush the TLB are ordered after we * read active_mm's tlb_gen. We don't need any explicit barriers * because all x86 flush operations are serializing and the * atomic64_read operation won't be reordered by the compiler. 
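choose_new_asid() itself is outside this hunk, but going by the ASID-cache description at the top of the file it amounts to scanning the small per-CPU ctxs[] array for the mm's ctx_id and evicting a slot on a miss. The rough model below is an assumption-laden sketch, not the kernel's exact policy; the slot layout and round-robin eviction are simplifications.

#include <stdint.h>
#include <stdio.h>

#define TLB_NR_DYN_ASIDS 6      /* assumed size of the per-CPU ASID cache */

/* Toy context slot: which mm (ctx_id) owns this ASID and how far its TLB
 * contents have been brought up to date (tlb_gen). */
struct toy_ctx {
        uint64_t ctx_id;
        uint64_t tlb_gen;
};

/* Reuse a slot that already holds the target mm and flush only if its
 * generation is stale; otherwise take the next slot and force a flush. */
static uint16_t pick_asid(struct toy_ctx ctxs[TLB_NR_DYN_ASIDS],
                          uint16_t *next_asid, uint64_t ctx_id,
                          uint64_t next_tlb_gen, int *need_flush)
{
        for (uint16_t asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                if (ctxs[asid].ctx_id != ctx_id)
                        continue;
                *need_flush = ctxs[asid].tlb_gen < next_tlb_gen;
                return asid;
        }

        uint16_t asid = (*next_asid)++ % TLB_NR_DYN_ASIDS;

        ctxs[asid].ctx_id = ctx_id;
        *need_flush = 1;
        return asid;
}

int main(void)
{
        struct toy_ctx ctxs[TLB_NR_DYN_ASIDS] = { { .ctx_id = 42, .tlb_gen = 3 } };
        uint16_t next_asid = 1;
        int need_flush;
        uint16_t asid;

        asid = pick_asid(ctxs, &next_asid, 42, 3, &need_flush);
        printf("hit:  asid=%u need_flush=%d\n", (unsigned)asid, need_flush);

        asid = pick_asid(ctxs, &next_asid, 99, 1, &need_flush);
        printf("miss: asid=%u need_flush=%d\n", (unsigned)asid, need_flush);
        return 0;
}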
*/ -static void flush_tlb_func_common(const struct flush_tlb_info *f, - bool local, enum tlb_flush_reason reason) +static void flush_tlb_func(void *info) { /* * We have three different tlb_gen values in here. They are: @@ -530,34 +731,63 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * - f->new_tlb_gen: the generation that the requester of the flush * wants us to catch up to. */ + const struct flush_tlb_info *f = info; struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; /* This code cannot presently handle being reentered. */ VM_WARN_ON(!irqs_disabled()); + if (!local) { + inc_irq_stat(irq_tlb_count); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + + /* Can only happen on remote CPUs */ + if (f->mm && f->mm != loaded_mm) + return; + } + if (unlikely(loaded_mm == &init_mm)) return; VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != loaded_mm->context.ctx_id); - if (this_cpu_read(cpu_tlbstate.is_lazy)) { + if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) { /* * We're in lazy mode. We need to at least flush our * paging-structure cache to avoid speculatively reading * garbage into our TLB. Since switching to init_mm is barely * slower than a minimal flush, just switch to init_mm. * - * This should be rare, with native_flush_tlb_others skipping + * This should be rare, with native_flush_tlb_multi() skipping * IPIs to lazy TLB mode CPUs. */ switch_mm_irqs_off(NULL, &init_mm, NULL); return; } + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* + * The TLB is already up to date in respect to f->new_tlb_gen. + * While the core might be still behind mm_tlb_gen, checking + * mm_tlb_gen unnecessarily would have negative caching effects + * so avoid it. + */ + return; + } + + /* + * Defer mm_tlb_gen reading as long as possible to avoid cache + * contention. + */ + mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); + if (unlikely(local_tlb_gen == mm_tlb_gen)) { /* * There's nothing to do: we're already up to date. This can @@ -565,8 +795,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * be handled can catch us all the way up, leaving no work for * the second flush. */ - trace_tlb_flush(reason, 0); - return; + goto done; } WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); @@ -601,7 +830,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, * 3, we'd be break the invariant: we'd update local_tlb_gen above * 1 without the full flush that's needed for tlb_gen 2. * - * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. 
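The early return added to flush_tlb_func() is a two-step check: compare the requested generation against the cheap per-CPU local_tlb_gen first, and only read the contended mm-wide tlb_gen when that does not already settle the question. A small sketch of that ordering; treating TLB_GENERATION_INVALID as 0 is an assumption about the sentinel value.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed sentinel meaning "no specific generation was requested". */
#define TLB_GENERATION_INVALID  0

/* Mirrors the ordering of the checks in flush_tlb_func(). */
static bool flush_needed(uint64_t new_tlb_gen, uint64_t local_tlb_gen,
                         const uint64_t *mm_tlb_gen)
{
        /* This CPU has already caught up with what the requester asked for. */
        if (new_tlb_gen != TLB_GENERATION_INVALID &&
            new_tlb_gen <= local_tlb_gen)
                return false;

        /* Defer the read of mm->context.tlb_gen as long as possible. */
        return *mm_tlb_gen != local_tlb_gen;
}

int main(void)
{
        uint64_t mm_tlb_gen = 5;

        printf("%d\n", flush_needed(4, 4, &mm_tlb_gen)); /* 0: request satisfied */
        printf("%d\n", flush_needed(5, 4, &mm_tlb_gen)); /* 1: must flush */
        return 0;
}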
* Partial TLB flushes are not all that much cheaper than full TLB * flushes, so it seems unlikely that it would be a performance win * to do a partial flush if that won't bring our TLB fully up to @@ -613,56 +842,58 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, f->new_tlb_gen == local_tlb_gen + 1 && f->new_tlb_gen == mm_tlb_gen) { /* Partial flush */ - unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift; unsigned long addr = f->start; + /* Partial flush cannot have invalid generations */ + VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); + + /* Partial flush must have valid mm */ + VM_WARN_ON(f->mm == NULL); + + nr_invalidate = (f->end - f->start) >> f->stride_shift; + while (addr < f->end) { - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); addr += 1UL << f->stride_shift; } if (local) count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate); - trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ - local_flush_tlb(); + nr_invalidate = TLB_FLUSH_ALL; + + flush_tlb_local(); if (local) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - trace_tlb_flush(reason, TLB_FLUSH_ALL); } /* Both paths above update our state to mm_tlb_gen. */ this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); -} - -static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) -{ - const struct flush_tlb_info *f = info; - - flush_tlb_func_common(f, true, reason); -} - -static void flush_tlb_func_remote(void *info) -{ - const struct flush_tlb_info *f = info; - inc_irq_stat(irq_tlb_count); - - if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm)) - return; - - count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); - flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); + /* Tracing is done in a unified manner to reduce the code size */ +done: + trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN : + (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : + TLB_LOCAL_MM_SHOOTDOWN, + nr_invalidate); } static bool tlb_is_not_lazy(int cpu, void *data) { - return !per_cpu(cpu_tlbstate.is_lazy, cpu); + return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared); + +STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { + /* + * Do accounting and tracing. Note that there are (and have always been) + * cases in which a remote TLB flush will be traced, but eventually + * would not happen. + */ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); @@ -670,29 +901,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, trace_tlb_flush(TLB_REMOTE_SEND_IPI, (info->end - info->start) >> PAGE_SHIFT); - if (is_uv_system()) { - /* - * This whole special case is confused. UV has a "Broadcast - * Assist Unit", which seems to be a fancy way to send IPIs. - * Back when x86 used an explicit TLB flush IPI, UV was - * optimized to use its own mechanism. These days, x86 uses - * smp_call_function_many(), but UV still uses a manual IPI, - * and that IPI's action is out of date -- it does a manual - * flush instead of calling flush_tlb_func_remote(). This - * means that the percpu tlb_gen variables won't be updated - * and we'll do pointless flushes on future context switches. 
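The partial-flush path is a simple strided invalidation loop; nr_invalidate is only used for accounting and the unified trace call at the end. A userspace sketch with a callback standing in for flush_tlb_one_user()/INVLPG:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

/* One invalidation per stride between start and end; returns the count
 * the caller would feed into the vm-event accounting and tracepoint. */
static unsigned long flush_range(uintptr_t start, uintptr_t end,
                                 unsigned int stride_shift,
                                 void (*flush_one)(uintptr_t))
{
        unsigned long nr_invalidate = (end - start) >> stride_shift;
        uintptr_t addr = start;

        while (addr < end) {
                flush_one(addr);
                addr += 1UL << stride_shift;
        }
        return nr_invalidate;
}

static void print_addr(uintptr_t addr)
{
        printf("flush %#lx\n", (unsigned long)addr);
}

int main(void)
{
        /* Flush three 4k pages; a 2M-huge-page range would use stride_shift = 21. */
        unsigned long n = flush_range(0x400000, 0x403000, PAGE_SHIFT, print_addr);

        printf("%lu invalidations\n", n);
        return 0;
}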
- * - * Rather than hooking native_flush_tlb_others() here, I think - * that UV should be updated so that smp_call_function_many(), - * etc, are optimal on UV. - */ - cpumask = uv_flush_tlb_others(cpumask, info); - if (cpumask) - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); - return; - } - /* * If no page tables were freed, we can skip sending IPIs to * CPUs in lazy TLB mode. They will flush the CPU themselves @@ -704,13 +912,18 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * doing a speculative memory access. */ if (info->freed_tables) - smp_call_function_many(cpumask, flush_tlb_func_remote, - (void *)info, 1); + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); else - on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func, (void *)info, 1, cpumask); } +void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_multi(cpumask, info); +} + /* * See Documentation/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at @@ -729,7 +942,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); #endif -static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, +static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables, u64 new_tlb_gen) @@ -751,14 +964,15 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, info->stride_shift = stride_shift; info->freed_tables = freed_tables; info->new_tlb_gen = new_tlb_gen; + info->initiating_cpu = smp_processor_id(); return info; } -static inline void put_flush_tlb_info(void) +static void put_flush_tlb_info(void) { #ifdef CONFIG_DEBUG_VM - /* Complete reentrency prevention checks */ + /* Complete reentrancy prevention checks */ barrier(); this_cpu_dec(flush_tlb_info_idx); #endif @@ -787,16 +1001,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, new_tlb_gen); - if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
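The IPI-targeting policy in native_flush_tlb_multi() boils down to: when page tables were freed, every CPU in the mask gets the flush callback; otherwise CPUs in lazy TLB mode are skipped, since they will flush themselves at their next context switch. A toy model, with fixed-size arrays standing in for the cpumask and the per-CPU lazy state:

#include <stdbool.h>
#include <stdio.h>

#define NR_TOY_CPUS 4

/* Decide, per CPU in the mask, whether the flush callback must run there. */
static void pick_flush_targets(const bool in_mask[NR_TOY_CPUS],
                               const bool is_lazy[NR_TOY_CPUS],
                               bool freed_tables)
{
        for (int cpu = 0; cpu < NR_TOY_CPUS; cpu++) {
                if (!in_mask[cpu])
                        continue;
                if (freed_tables || !is_lazy[cpu])
                        printf("run flush_tlb_func on CPU %d\n", cpu);
                else
                        printf("skip lazy CPU %d\n", cpu);
        }
}

int main(void)
{
        bool mask[NR_TOY_CPUS] = { true, true, true, false };
        bool lazy[NR_TOY_CPUS] = { false, true, false, false };

        pick_flush_targets(mask, lazy, false);  /* ordinary flush */
        pick_flush_targets(mask, lazy, true);   /* page tables were freed */
        return 0;
}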
+ */ + if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + flush_tlb_multi(mm_cpumask(mm), info); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); + flush_tlb_func(info); local_irq_enable(); } - if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), info); - put_flush_tlb_info(); put_cpu(); } @@ -821,7 +1039,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -834,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) struct flush_tlb_info *info; preempt_disable(); - info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + info = get_flush_tlb_info(NULL, start, end, 0, false, + TLB_GENERATION_INVALID); on_each_cpu(do_kernel_range_flush, info, 1); @@ -844,36 +1063,219 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) } /* - * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. - * This means that the 'struct flush_tlb_info' that describes which mappings to - * flush is actually fixed. We therefore set a single fixed struct and use it in - * arch_tlbbatch_flush(). + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); + +/* + * Flush one page in the kernel mapping */ -static const struct flush_tlb_info full_flush_tlb_info = { - .mm = NULL, - .start = 0, - .end = TLB_FLUSH_ALL, -}; +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. 
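flush_tlb_mm_range() (and arch_tlbbatch_flush() later in the file) now picks between flush_tlb_multi() and a direct, IRQs-off call to flush_tlb_func() depending on whether any CPU other than the local one has the mm in its cpumask. A miniature of that dispatch, with a plain bitmask standing in for the cpumask:

#include <stdbool.h>
#include <stdio.h>

/* Send IPIs only when some remote CPU has the mm loaded; otherwise flush
 * locally with interrupts disabled, or do nothing at all. */
static void flush_dispatch(unsigned long mm_cpumask, int this_cpu,
                           bool mm_is_loaded_here)
{
        bool remote_cpus = (mm_cpumask & ~(1UL << this_cpu)) != 0;

        if (remote_cpus)
                printf("flush_tlb_multi(): IPIs plus local flush\n");
        else if (mm_is_loaded_here)
                printf("flush_tlb_func() called directly, IRQs off\n");
        else
                printf("nothing to do\n");
}

int main(void)
{
        flush_dispatch(0x1, 0, true);   /* only this CPU uses the mm */
        flush_dispatch(0x5, 0, true);   /* CPU 2 also has it loaded */
        return 0;
}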
+ */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + __native_tlb_flush_global(this_cpu_read(cpu_tlbstate.cr4)); + + raw_local_irq_restore(flags); +} + +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ + flush_tlb_local(); + } +} +EXPORT_SYMBOL_GPL(__flush_tlb_all); void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { + struct flush_tlb_info *info; + int cpu = get_cpu(); - if (cpumask_test_cpu(cpu, &batch->cpumask)) { + info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, + TLB_GENERATION_INVALID); + /* + * flush_tlb_multi() is not optimized for the common case in which only + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
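Taken together, __flush_tlb_all() and native_flush_tlb_global() select among three full-flush mechanisms. The sketch below encodes that selection as a pure function; the feature flags are plain booleans rather than cpu_feature_enabled()/boot_cpu_has() checks, and the INVPCID label reflects the usual reading of invpcid_flush_all().

#include <stdbool.h>
#include <stdio.h>

/* Selection of the full-flush mechanism, distilled from this hunk. */
static const char *pick_full_flush_method(bool has_pge, bool has_invpcid)
{
        if (!has_pge) {
                /* !PGE implies !PCID (setup_pcid()), so a CR3 reload is total. */
                return "plain CR3 reload (flush_tlb_local)";
        }
        if (has_invpcid) {
                /* Understood to use the all-inclusive-globals INVPCID type. */
                return "invpcid_flush_all()";
        }
        return "toggle CR4.PGE with IRQs disabled";
}

int main(void)
{
        printf("%s\n", pick_full_flush_method(true, true));
        printf("%s\n", pick_full_flush_method(true, false));
        printf("%s\n", pick_full_flush_method(false, false));
        return 0;
}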
+ */ + if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); + flush_tlb_func(info); local_irq_enable(); } - if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); - cpumask_clear(&batch->cpumask); + put_flush_tlb_info(); put_cpu(); } +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. + */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { |
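nmi_uaccess_okay() reduces to a single conservative comparison: user memory may only be touched from NMI context when the mm live in CR3 is the interrupted task's mm. A toy rendering, with opaque pointers standing in for struct mm_struct:

#include <stdbool.h>
#include <stdio.h>

/*
 * Comparing loaded_mm against current->mm is conservative but cheap.
 * switch_mm_irqs_off() parks loaded_mm on a sentinel (LOADED_MM_SWITCHING)
 * before writing CR3, so the window where CR3 is in flux also fails this
 * comparison.
 */
static bool nmi_uaccess_okay(const void *loaded_mm, const void *current_mm)
{
        return loaded_mm == current_mm;
}

int main(void)
{
        static int mm_a, mm_b;          /* stand-ins for two mm_structs */

        printf("%d\n", nmi_uaccess_okay(&mm_a, &mm_a)); /* 1: safe */
        printf("%d\n", nmi_uaccess_okay(&mm_a, &mm_b)); /* 0: refuse */
        return 0;
}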