diff options
Diffstat (limited to 'arch/x86')
36 files changed, 584 insertions, 198 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 30c40f08a3d4..f67e839f06c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,7 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT @@ -482,11 +481,12 @@ config X86_INTEL_LPSS bool "Intel Low Power Subsystem Support" depends on ACPI select COMMON_CLK + select PINCTRL ---help--- Select to build support for Intel Low Power Subsystem such as found on Intel Lynxpoint PCH. Selecting this option enables - things like clock tree (common clock framework) which are needed - by the LPSS peripheral drivers. + things like clock tree (common clock framework) and pincontrol + which are needed by the LPSS peripheral drivers. config X86_RDC321X bool "RDC R-321x SoC" @@ -860,7 +860,7 @@ source "kernel/Kconfig.preempt" config X86_UP_APIC bool "Local APIC support on uniprocessors" - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI ---help--- A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. If you have a single-CPU @@ -885,11 +885,11 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI config X86_IO_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC + depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI config X86_VISWS_APIC def_bool y @@ -1033,6 +1033,7 @@ config X86_REBOOTFIXUPS config MICROCODE tristate "CPU microcode loading support" + depends on CPU_SUP_AMD || CPU_SUP_INTEL select FW_LOADER ---help--- diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index d3f5c63078d8..89270b4318db 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -374,7 +374,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) * Catch too early usage of this before alternatives * have run. */ - asm goto("1: jmp %l[t_warn]\n" + asm_volatile_goto("1: jmp %l[t_warn]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -388,7 +388,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) #endif - asm goto("1: jmp %l[t_no]\n" + asm_volatile_goto("1: jmp %l[t_no]\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" @@ -453,7 +453,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) * have. Thus, we force the jump to the widest, 4-byte, signed relative * offset even though the last would often fit in less bytes. */ - asm goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" "2:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3a16c1483b45..6a2cefb4395a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -3,18 +3,23 @@ #ifdef __KERNEL__ +#include <linux/stringify.h> #include <linux/types.h> #include <asm/nops.h> #include <asm/asm.h> #define JUMP_LABEL_NOP_SIZE 5 -#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#ifdef CONFIG_X86_64 +# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC +#else +# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC +#endif static __always_inline bool arch_static_branch(struct static_key *key) { - asm goto("1:" - STATIC_KEY_INITIAL_NOP + asm_volatile_goto("1:" + ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h index e7e6751648ed..07537a44216e 100644 --- a/arch/x86/include/asm/mutex_64.h +++ b/arch/x86/include/asm/mutex_64.h @@ -20,7 +20,7 @@ static inline void __mutex_fastpath_lock(atomic_t *v, void (*fail_fn)(atomic_t *)) { - asm volatile goto(LOCK_PREFIX " decl %0\n" + asm_volatile_goto(LOCK_PREFIX " decl %0\n" " jns %l[exit]\n" : : "m" (v->counter) : "memory", "cc" @@ -75,7 +75,7 @@ static inline int __mutex_fastpath_lock_retval(atomic_t *count) static inline void __mutex_fastpath_unlock(atomic_t *v, void (*fail_fn)(atomic_t *)) { - asm volatile goto(LOCK_PREFIX " incl %0\n" + asm_volatile_goto(LOCK_PREFIX " incl %0\n" " jg %l[exit]\n" : : "m" (v->counter) : "memory", "cc" diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8d16befdec88..3d1999458709 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - -static inline int pte_swp_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; -} - -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - static inline pte_t pte_file_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include <linux/mm_types.h> +#include <linux/mmdebug.h> #include <linux/log2.h> static inline int pte_none(pte_t pte) @@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..0ecac257fb26 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -75,6 +75,9 @@ * with swap entry format. On x86 bits 6 and 7 are *not* involved * into swap entry computation, but bit 6 is used for nonlinear * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf512003e663..e6d90babc245 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. + */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 6aef9fbc09b7..b913915e8e63 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -79,30 +79,38 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn) return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; } -static inline unsigned long mfn_to_pfn(unsigned long mfn) +static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) { unsigned long pfn; - int ret = 0; + int ret; if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; - if (unlikely(mfn >= machine_to_phys_nr)) { - pfn = ~0; - goto try_override; - } - pfn = 0; + if (unlikely(mfn >= machine_to_phys_nr)) + return ~0; + /* * The array access can fail (e.g., device space beyond end of RAM). * In such cases it doesn't matter what we return (we return garbage), * but we must handle the fault without crashing! */ ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); -try_override: - /* ret might be < 0 if there are no entries in the m2p for mfn */ if (ret < 0) - pfn = ~0; - else if (get_phys_to_machine(pfn) != mfn) + return ~0; + + return pfn; +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) != mfn) { /* * If this appears to be a foreign mfn (because the pfn * doesn't map back to the mfn), then check the local override @@ -111,6 +119,7 @@ try_override: * m2p_find_override_pfn returns ~0 if it doesn't find anything. */ pfn = m2p_find_override_pfn(mfn, ~0); + } /* * pfn is ~0 if there are no entries in the m2p for mfn or if the diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1191ac1c9d25..a419814cea57 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -113,7 +113,7 @@ static int __init early_get_pnodeid(void) break; case UV3_HUB_PART_NUMBER: case UV3_HUB_PART_NUMBER_X: - uv_min_hub_revision_id += UV3_HUB_REVISION_BASE - 1; + uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; break; } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d4cdfa67509e..ce2d0a2c3e4f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8355c84b9729..9d8449158cf9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1506,7 +1506,7 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return 0; + err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); @@ -1883,26 +1883,21 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_time_zero = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) - return; - - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + if (!sched_clock_stable) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; - if (sched_clock_stable && !check_tsc_disabled()) { - userpg->cap_usr_time_zero = 1; - userpg->time_zero = this_cpu_read(cyc2ns_offset); - } + userpg->cap_user_time_zero = 1; + userpg->time_zero = this_cpu_read(cyc2ns_offset); } /* diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 0abf6742a8b0..f31a1655d1ff 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -124,6 +124,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */ INTEL_UEVENT_CONSTRAINT(0x06a3, 0xf), /* CYCLE_ACTIVITY.STALLS_LDM_PENDING */ @@ -898,8 +899,8 @@ static __initconst const u64 atom_hw_cache_event_ids static struct extra_reg intel_slm_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffff, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffff, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), EVENT_EXTRA_END }; @@ -2324,6 +2325,7 @@ __init int intel_pmu_init(void) break; case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 63438aad177f..ab3ba1c1b7dd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -584,6 +584,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ EVENT_CONSTRAINT_END }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index fd8011ed4dcd..4118f9f68315 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2706,14 +2706,14 @@ static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) box->hrtimer.function = uncore_pmu_hrtimer; } -struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) +static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node) { struct intel_uncore_box *box; int i, size; size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); - box = kzalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); + box = kzalloc_node(size, GFP_KERNEL, node); if (!box) return NULL; @@ -2808,7 +2808,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve return c; } - if (event->hw.config == ~0ULL) + if (event->attr.config == UNCORE_FIXED_EVENT) return &constraint_fixed; if (type->constraints) { @@ -3031,7 +3031,7 @@ static int uncore_validate_group(struct intel_uncore_pmu *pmu, struct intel_uncore_box *fake_box; int ret = -EINVAL, n; - fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE); if (!fake_box) return -ENOMEM; @@ -3112,7 +3112,9 @@ static int uncore_pmu_event_init(struct perf_event *event) */ if (pmu->type->single_fixed && pmu->pmu_idx > 0) return -EINVAL; - hwc->config = ~0ULL; + + /* fixed counters have event field hardcoded to zero */ + hwc->config = 0ULL; } else { hwc->config = event->attr.config & pmu->type->event_mask; if (pmu->type->ops->hw_config) { @@ -3292,7 +3294,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id } type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; - box = uncore_alloc_box(type, 0); + box = uncore_alloc_box(type, NUMA_NO_NODE); if (!box) return -ENOMEM; @@ -3497,7 +3499,7 @@ static int uncore_cpu_prepare(int cpu, int phys_id) if (pmu->func_id < 0) pmu->func_id = j; - box = uncore_alloc_box(type, cpu); + box = uncore_alloc_box(type, cpu_to_node(cpu)); if (!box) return -ENOMEM; diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 69eb2fa25494..376dc7873447 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -52,8 +52,7 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) } #ifdef CONFIG_BLK_DEV_INITRD -void __init early_init_dt_setup_initrd_arch(unsigned long start, - unsigned long end) +void __init early_init_dt_setup_initrd_arch(u64 start, u64 end) { initrd_start = (unsigned long)__va(start); initrd_end = (unsigned long)__va(end); diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 63bdb29b2549..b3cd3ebae077 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -12,6 +12,7 @@ #include <linux/pci.h> #include <linux/acpi.h> #include <linux/pci_ids.h> +#include <drm/i915_drm.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/io_apic.h> @@ -216,6 +217,157 @@ static void __init intel_remapping_check(int num, int slot, int func) } +/* + * Systems with Intel graphics controllers set aside memory exclusively + * for gfx driver use. This memory is not marked in the E820 as reserved + * or as RAM, and so is subject to overlap from E820 manipulation later + * in the boot process. On some systems, MMIO space is allocated on top, + * despite the efforts of the "RAM buffer" approach, which simply rounds + * memory boundaries up to 64M to try to catch space that may decode + * as RAM and so is not suitable for MMIO. + * + * And yes, so far on current devices the base addr is always under 4G. + */ +static u32 __init intel_stolen_base(int num, int slot, int func) +{ + u32 base; + + /* + * For the PCI IDs in this quirk, the stolen base is always + * in 0x5c, aka the BDSM register (yes that's really what + * it's called). + */ + base = read_pci_config(num, slot, func, 0x5c); + base &= ~((1<<20) - 1); + + return base; +} + +#define KB(x) ((x) * 1024) +#define MB(x) (KB (KB (x))) +#define GB(x) (MB (KB (x))) + +static size_t __init gen3_stolen_size(int num, int slot, int func) +{ + size_t stolen_size; + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); + + switch (gmch_ctrl & I855_GMCH_GMS_MASK) { + case I855_GMCH_GMS_STOLEN_1M: + stolen_size = MB(1); + break; + case I855_GMCH_GMS_STOLEN_4M: + stolen_size = MB(4); + break; + case I855_GMCH_GMS_STOLEN_8M: + stolen_size = MB(8); + break; + case I855_GMCH_GMS_STOLEN_16M: + stolen_size = MB(16); + break; + case I855_GMCH_GMS_STOLEN_32M: + stolen_size = MB(32); + break; + case I915_GMCH_GMS_STOLEN_48M: + stolen_size = MB(48); + break; + case I915_GMCH_GMS_STOLEN_64M: + stolen_size = MB(64); + break; + case G33_GMCH_GMS_STOLEN_128M: + stolen_size = MB(128); + break; + case G33_GMCH_GMS_STOLEN_256M: + stolen_size = MB(256); + break; + case INTEL_GMCH_GMS_STOLEN_96M: + stolen_size = MB(96); + break; + case INTEL_GMCH_GMS_STOLEN_160M: + stolen_size = MB(160); + break; + case INTEL_GMCH_GMS_STOLEN_224M: + stolen_size = MB(224); + break; + case INTEL_GMCH_GMS_STOLEN_352M: + stolen_size = MB(352); + break; + default: + stolen_size = 0; + break; + } + + return stolen_size; +} + +static size_t __init gen6_stolen_size(int num, int slot, int func) +{ + u16 gmch_ctrl; + + gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); + gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; + gmch_ctrl &= SNB_GMCH_GMS_MASK; + + return gmch_ctrl << 25; /* 32 MB units */ +} + +typedef size_t (*stolen_size_fn)(int num, int slot, int func); + +static struct pci_device_id intel_stolen_ids[] __initdata = { + INTEL_I915G_IDS(gen3_stolen_size), + INTEL_I915GM_IDS(gen3_stolen_size), + INTEL_I945G_IDS(gen3_stolen_size), + INTEL_I945GM_IDS(gen3_stolen_size), + INTEL_VLV_M_IDS(gen3_stolen_size), + INTEL_VLV_D_IDS(gen3_stolen_size), + INTEL_PINEVIEW_IDS(gen3_stolen_size), + INTEL_I965G_IDS(gen3_stolen_size), + INTEL_G33_IDS(gen3_stolen_size), + INTEL_I965GM_IDS(gen3_stolen_size), + INTEL_GM45_IDS(gen3_stolen_size), + INTEL_G45_IDS(gen3_stolen_size), + INTEL_IRONLAKE_D_IDS(gen3_stolen_size), + INTEL_IRONLAKE_M_IDS(gen3_stolen_size), + INTEL_SNB_D_IDS(gen6_stolen_size), + INTEL_SNB_M_IDS(gen6_stolen_size), + INTEL_IVB_M_IDS(gen6_stolen_size), + INTEL_IVB_D_IDS(gen6_stolen_size), + INTEL_HSW_D_IDS(gen6_stolen_size), + INTEL_HSW_M_IDS(gen6_stolen_size), +}; + +static void __init intel_graphics_stolen(int num, int slot, int func) +{ + size_t size; + int i; + u32 start; + u16 device, subvendor, subdevice; + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + subvendor = read_pci_config_16(num, slot, func, + PCI_SUBSYSTEM_VENDOR_ID); + subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); + + for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { + if (intel_stolen_ids[i].device == device) { + stolen_size_fn stolen_size = + (stolen_size_fn)intel_stolen_ids[i].driver_data; + size = stolen_size(num, slot, func); + start = intel_stolen_base(num, slot, func); + if (size && start) { + /* Mark this space as reserved */ + e820_add_region(start, size, E820_RESERVED); + sanitize_e820_map(e820.map, + ARRAY_SIZE(e820.map), + &e820.nr_map); + } + return; + } + } +} + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -251,6 +403,8 @@ static struct chipset early_qrk[] __initdata = { PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, + { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, + QFLAG_APPLY_ONCE, intel_graphics_stolen }, {} }; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1b69951a81e2..b077f4cc225a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -487,21 +487,6 @@ ENDPROC(native_usergs_sysret64) TRACE_IRQS_OFF .endm -ENTRY(save_rest) - PARTIAL_FRAME 1 (REST_SKIP+8) - movq 5*8+16(%rsp), %r11 /* save return address */ - movq_cfi rbx, RBX+16 - movq_cfi rbp, RBP+16 - movq_cfi r12, R12+16 - movq_cfi r13, R13+16 - movq_cfi r14, R14+16 - movq_cfi r15, R15+16 - movq %r11, 8(%rsp) /* return address */ - FIXUP_TOP_OF_STACK %r11, 16 - ret - CFI_ENDPROC -END(save_rest) - /* save complete stack frame */ .pushsection .kprobes.text, "ax" ENTRY(save_paranoid) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 460f5d9ceebb..ee11b7dfbfbb 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -24,18 +24,57 @@ union jump_code_union { } __attribute__((packed)); }; +static void bug_at(unsigned char *ip, int line) +{ + /* + * The location is not an op that we were expecting. + * Something went wrong. Crash the box, as something could be + * corrupting the kernel. + */ + pr_warning("Unexpected op at %pS [%p] (%02x %02x %02x %02x %02x) %s:%d\n", + ip, ip, ip[0], ip[1], ip[2], ip[3], ip[4], __FILE__, line); + BUG(); +} + static void __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t)) + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; if (type == JUMP_LABEL_ENABLE) { + /* + * We are enabling this jump label. If it is not a nop + * then something must have gone wrong. + */ + if (unlikely(memcmp((void *)entry->code, ideal_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + code.jump = 0xe9; code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); - } else + } else { + /* + * We are disabling this jump label. If it is not what + * we think it is, then something must have gone wrong. + * If this is the first initialization call, then we + * are converting the default nop to the ideal nop. + */ + if (init) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + if (unlikely(memcmp((void *)entry->code, default_nop, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } else { + code.jump = 0xe9; + code.offset = entry->target - + (entry->code + JUMP_LABEL_NOP_SIZE); + if (unlikely(memcmp((void *)entry->code, &code, 5) != 0)) + bug_at((void *)entry->code, __LINE__); + } memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + } /* * Make text_poke_bp() a default fallback poker. @@ -57,15 +96,38 @@ void arch_jump_label_transform(struct jump_entry *entry, { get_online_cpus(); mutex_lock(&text_mutex); - __jump_label_transform(entry, type, NULL); + __jump_label_transform(entry, type, NULL, 0); mutex_unlock(&text_mutex); put_online_cpus(); } +static enum { + JL_STATE_START, + JL_STATE_NO_UPDATE, + JL_STATE_UPDATE, +} jlstate __initdata_or_module = JL_STATE_START; + __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { - __jump_label_transform(entry, type, text_poke_early); + /* + * This function is called at boot up and when modules are + * first loaded. Check if the default nop, the one that is + * inserted at compile time, is the ideal nop. If it is, then + * we do not need to update the nop, and we can leave it as is. + * If it is not, then we need to update the nop to the ideal nop. + */ + if (jlstate == JL_STATE_START) { + const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; + const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + + if (memcmp(ideal_nop, default_nop, 5) != 0) + jlstate = JL_STATE_UPDATE; + else + jlstate = JL_STATE_NO_UPDATE; + } + if (jlstate == JL_STATE_UPDATE) + __jump_label_transform(entry, type, text_poke_early, 1); } #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 697b93af02dd..a0e2a8a80c94 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -775,11 +775,22 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; - printk(KERN_INFO "KVM setup paravirtual spinlock\n"); + pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); + pv_lock_ops.unlock_kick = kvm_unlock_kick; +} + +static __init int kvm_spinlock_init_jump(void) +{ + if (!kvm_para_available()) + return 0; + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + return 0; static_key_slow_inc(¶virt_ticketlocks_enabled); + printk(KERN_INFO "KVM setup paravirtual spinlock\n"); - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); - pv_lock_ops.unlock_kick = kvm_unlock_kick; + return 0; } +early_initcall(kvm_spinlock_init_jump); + #endif /* CONFIG_PARAVIRT_SPINLOCKS */ diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 7123b5df479d..af99f71aeb7f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -216,6 +216,7 @@ int apply_microcode_amd(int cpu) /* need to apply patch? */ if (rev >= mc_amd->hdr.patch_id) { c->microcode = rev; + uci->cpu_sig.rev = rev; return 0; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 563ed91e6faa..7e920bff99a3 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -326,6 +326,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"), }, }, + { /* Handle problems with rebooting on the Latitude E5410. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5410", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5410"), + }, + }, { /* Handle problems with rebooting on the Latitude E5420. */ .callback = set_pci_reboot, .ident = "Dell Latitude E5420", @@ -352,12 +360,28 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, { /* Handle problems with rebooting on the Precision M6600. */ .callback = set_pci_reboot, - .ident = "Dell OptiPlex 990", + .ident = "Dell Precision M6600", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), }, }, + { /* Handle problems with rebooting on the Dell PowerEdge C6100. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, + { /* Some C6100 machines were shipped with vendor being 'Dell'. */ + .callback = set_pci_reboot, + .ident = "Dell PowerEdge C6100", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "C6100"), + }, + }, { } }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index aecc98a93d1b..6cacab671f9b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -653,6 +653,7 @@ static void announce_cpu(int cpu, int apicid) { static int current_node = -1; int node = early_cpu_to_node(cpu); + int max_cpu_present = find_last_bit(cpumask_bits(cpu_present_mask), NR_CPUS); if (system_state == SYSTEM_BOOTING) { if (node != current_node) { @@ -661,7 +662,7 @@ static void announce_cpu(int cpu, int apicid) current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); + pr_cont(" #%4d%s", cpu, cpu == max_cpu_present ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c index 22513e96b012..86179d409893 100644 --- a/arch/x86/kernel/sysfb_simplefb.c +++ b/arch/x86/kernel/sysfb_simplefb.c @@ -72,14 +72,14 @@ __init int create_simplefb(const struct screen_info *si, * the part that is occupied by the framebuffer */ len = mode->height * mode->stride; len = PAGE_ALIGN(len); - if (len > si->lfb_size << 16) { + if (len > (u64)si->lfb_size << 16) { printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); return -EINVAL; } /* setup IORESOURCE_MEM as framebuffer memory */ memset(&res, 0, sizeof(res)); - res.flags = IORESOURCE_MEM; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; res.name = simplefb_resname; res.start = si->lfb_base; res.end = si->lfb_base + len - 1; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2bc1e81045b0..ddc3f3d2afdb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2025,6 +2025,17 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } +static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) +{ + int rc; + + rc = em_ret_far(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + rsp_increment(ctxt, ctxt->src.val); + return X86EMUL_CONTINUE; +} + static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ @@ -3763,7 +3774,8 @@ static const struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - N, I(ImplicitOps | Stack, em_ret_far), + I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm), + I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6e2d2c8f230b..dce0df8150df 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4421,13 +4421,12 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) } } -static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) +static unsigned long +mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; int nr_to_scan = sc->nr_to_scan; - - if (nr_to_scan == 0) - goto out; + unsigned long freed = 0; raw_spin_lock(&kvm_lock); @@ -4462,25 +4461,37 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - prepare_zap_oldest_mmu_page(kvm, &invalid_list); + if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) + freed++; kvm_mmu_commit_zap_page(kvm, &invalid_list); unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + /* + * unfair on small ones + * per-vm shrinkers cry out + * sadness comes quickly + */ list_move_tail(&kvm->vm_list, &vm_list); break; } raw_spin_unlock(&kvm_lock); + return freed; -out: +} + +static unsigned long +mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { - .shrink = mmu_shrink, + .count_objects = mmu_shrink_count, + .scan_objects = mmu_shrink_scan, .seeks = DEFAULT_SEEKS * 10, }; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 043330159179..ad75d77999d0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -99,6 +99,7 @@ struct guest_walker { pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; + bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -235,6 +236,22 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (pte == orig_pte) continue; + /* + * If the slot is read-only, simply do not process the accessed + * and dirty bits. This is the correct thing to do if the slot + * is ROM, and page tables in read-as-ROM/write-as-MMIO slots + * are only supported if the accessed and dirty bits are already + * set in the ROM (so that MMIO writes are never needed). + * + * Note that NPT does not allow this at all and faults, since + * it always wants nested page table entries for the guest + * page tables to be writable. And EPT works but will simply + * overwrite the read-only memory to set the accessed and dirty + * bits. + */ + if (unlikely(!walker->pte_writable[level - 1])) + continue; + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); if (ret) return ret; @@ -309,7 +326,8 @@ retry_walk: goto error; real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f1da43ff2a2..2b2fce1b2009 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3255,25 +3255,29 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (!test_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty)) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -5339,6 +5343,17 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 654be4ae3047..3aaeffcfd67a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -842,23 +842,15 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, force_sig_info_fault(SIGBUS, code, address, tsk, fault); } -static noinline int +static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { - /* - * Pagefault was interrupted by SIGKILL. We have no reason to - * continue pagefault. - */ - if (fatal_signal_pending(current)) { - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); - if (!(error_code & PF_USER)) - no_context(regs, error_code, address, 0, 0); - return 1; + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address, 0, 0); + return; } - if (!(fault & VM_FAULT_ERROR)) - return 0; if (fault & VM_FAULT_OOM) { /* Kernel mode? Handle exceptions or die: */ @@ -866,7 +858,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, up_read(¤t->mm->mmap_sem); no_context(regs, error_code, address, SIGSEGV, SEGV_MAPERR); - return 1; + return; } up_read(¤t->mm->mmap_sem); @@ -884,7 +876,6 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, else BUG(); } - return 1; } static int spurious_fault_check(unsigned long error_code, pte_t *pte) @@ -1011,9 +1002,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; struct mm_struct *mm; int fault; - int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | - (write ? FAULT_FLAG_WRITE : 0); + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; @@ -1083,6 +1072,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1109,6 +1099,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) return; } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in @@ -1187,9 +1180,17 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); - if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { - if (mm_fault_error(regs, error_code, address, fault)) - return; + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + mm_fault_error(regs, error_code, address, fault); + return; } /* diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 7e73e8c69096..9d980d88b747 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -77,6 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } +int pmd_huge_support(void) +{ + return 1; +} #endif /* x86_64 also uses this file */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7e..ae699b3bbac8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 5596c7bdd327..082e88129712 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -700,7 +700,7 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, if (!(pci_probe & PCI_PROBE_MMCONF) || pci_mmcfg_arch_init_failed) return -ENODEV; - if (start > end || !addr) + if (start > end) return -EINVAL; mutex_lock(&pci_mmcfg_lock); @@ -716,6 +716,11 @@ int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end, return -EEXIST; } + if (!addr) { + mutex_unlock(&pci_mmcfg_lock); + return -EINVAL; + } + rc = -EBUSY; cfg = pci_mmconfig_alloc(seg, start, end, addr); if (cfg == NULL) { diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 90f6ed127096..c7e22ab29a5a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -912,10 +912,13 @@ void __init efi_enter_virtual_mode(void) for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME) && - md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) - continue; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) { +#ifdef CONFIG_X86_64 + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) +#endif + continue; + } size = md->num_pages << EFI_PAGE_SHIFT; end = md->phys_addr + size; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2fc216dfbd9c..fa6ade76ef3f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1692,7 +1692,6 @@ static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, case CPU_UP_PREPARE: xen_vcpu_setup(cpu); if (xen_have_vector_callback) { - xen_init_lock_cpu(cpu); if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); } diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 0d4ec35895d4..a61c7d5811be 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -879,7 +879,6 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -926,8 +925,8 @@ int m2p_add_override(unsigned long mfn, struct page *page, * frontend pages while they are being shared with the backend, * because mfn_to_pfn (that ends up being called by GUPF) will * return the backend pfn rather than the frontend pfn. */ - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == mfn) + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == mfn) set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); return 0; @@ -942,7 +941,6 @@ int m2p_remove_override(struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; - int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -990,10 +988,13 @@ int m2p_remove_override(struct page *page, printk(KERN_WARNING "m2p_remove_override: " "pfn %lx mfn %lx, failed to modify kernel mappings", pfn, mfn); + put_balloon_scratch_page(); return -1; } - mcs = xen_mc_entry( + xen_mc_batch(); + + mcs = __xen_mc_entry( sizeof(struct gnttab_unmap_and_replace)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; @@ -1003,12 +1004,11 @@ int m2p_remove_override(struct page *page, MULTI_grant_table_op(mcs.mc, GNTTABOP_unmap_and_replace, unmap_op, 1); - xen_mc_issue(PARAVIRT_LAZY_MMU); - mcs = __xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, scratch_page_address, - pfn_pte(page_to_pfn(get_balloon_scratch_page()), + pfn_pte(page_to_pfn(scratch_page), PAGE_KERNEL_RO), 0); + xen_mc_issue(PARAVIRT_LAZY_MMU); kmap_op->host_addr = 0; @@ -1027,8 +1027,8 @@ int m2p_remove_override(struct page *page, * the original pfn causes mfn_to_pfn(mfn) to return the frontend * pfn again. */ mfn &= ~FOREIGN_FRAME_BIT; - ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); - if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + pfn = mfn_to_pfn_no_overrides(mfn); + if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && m2p_find_override(mfn) == NULL) set_phys_to_machine(pfn, mfn); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 9235842cd76a..31d04758b76f 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -273,12 +273,29 @@ static void __init xen_smp_prepare_boot_cpu(void) BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (xen_pv_domain()) { + /* We've switched to the "real" per-cpu gdt, so make sure the + old memory can be recycled */ + make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); - xen_setup_vcpu_info_placement(); +#ifdef CONFIG_X86_32 + /* + * Xen starts us with XEN_FLAT_RING1_DS, but linux code + * expects __USER_DS + */ + loadsegment(ds, __USER_DS); + loadsegment(es, __USER_DS); +#endif + + xen_filter_cpu_maps(); + xen_setup_vcpu_info_placement(); + } + /* + * The alternative logic (which patches the unlock/lock) runs before + * the smp bootup up code is activated. Hence we need to set this up + * the core kernel is being patched. Otherwise we will have only + * modules patched but not core code. + */ xen_init_spinlocks(); } @@ -709,6 +726,15 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) WARN_ON(rc); if (!rc) rc = native_cpu_up(cpu, tidle); + + /* + * We must initialize the slowpath CPU kicker _after_ the native + * path has executed. If we initialized it before none of the + * unlocker IPI kicks would reach the booting CPU as the booting + * CPU had not set itself 'online' in cpu_online_mask. That mask + * is checked when IPIs are sent (on HVM at least). + */ + xen_init_lock_cpu(cpu); return rc; } @@ -728,4 +754,5 @@ void __init xen_hvm_smp_init(void) smp_ops.cpu_die = xen_hvm_cpu_die; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; + smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0438b9324a72..be6b86078957 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -81,7 +81,6 @@ static inline void spin_time_accum_blocked(u64 start) spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ -#define TIMEOUT (1 << 10) static inline void add_stats(enum xen_contention_stat var, u32 val) { } @@ -96,23 +95,6 @@ static inline void spin_time_accum_blocked(u64 start) } #endif /* CONFIG_XEN_DEBUG_FS */ -/* - * Size struct xen_spinlock so it's the same as arch_spinlock_t. - */ -#if NR_CPUS < 256 -typedef u8 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory"); -#else -typedef u16 xen_spinners_t; -# define inc_spinners(xl) \ - asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory"); -# define dec_spinners(xl) \ - asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory"); -#endif - struct xen_lock_waiting { struct arch_spinlock *lock; __ticket_t want; @@ -123,6 +105,7 @@ static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; +static bool xen_pvspin = true; static void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -241,16 +224,12 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; + if (!xen_pvspin) + return; + WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) - return; - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, @@ -270,11 +249,7 @@ void xen_init_lock_cpu(int cpu) void xen_uninit_lock_cpu(int cpu) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) + if (!xen_pvspin) return; unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); @@ -283,28 +258,43 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } -static bool xen_pvspin __initdata = true; +/* + * Our init of PV spinlocks is split in two init functions due to us + * using paravirt patching and jump labels patching and having to do + * all of this before SMP code is invoked. + * + * The paravirt patching needs to be done _before_ the alternative asm code + * is started, otherwise we would not patch the core kernel code. + */ void __init xen_init_spinlocks(void) { - /* - * See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23 - * (xen: disable PV spinlocks on HVM) - */ - if (xen_hvm_domain()) - return; if (!xen_pvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); return; } - static_key_slow_inc(¶virt_ticketlocks_enabled); - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; } +/* + * While the jump_label init code needs to happend _after_ the jump labels are + * enabled and before SMP is started. Hence we use pre-SMP initcall level + * init. We cannot do it in xen_init_spinlocks as that is done before + * jump labels are activated. + */ +static __init int xen_init_spinlocks_jump(void) +{ + if (!xen_pvspin) + return 0; + + static_key_slow_inc(¶virt_ticketlocks_enabled); + return 0; +} +early_initcall(xen_init_spinlocks_jump); + static __init int xen_parse_nopvspin(char *arg) { xen_pvspin = false; @@ -323,6 +313,9 @@ static int __init xen_spinlock_debugfs(void) if (d_xen == NULL) return -ENOMEM; + if (!xen_pvspin) + return 0; + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); |