aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig84
-rw-r--r--arch/x86/Kconfig.debug12
-rw-r--r--arch/x86/Makefile15
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/boot.h3
-rw-r--r--arch/x86/boot/compressed/aslr.c2
-rw-r--r--arch/x86/boot/compressed/eboot.c8
-rw-r--r--arch/x86/boot/compressed/misc.c27
-rw-r--r--arch/x86/boot/compressed/misc.h11
-rw-r--r--arch/x86/boot/header.S2
-rw-r--r--arch/x86/boot/main.c3
-rw-r--r--arch/x86/boot/mca.c38
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig2
-rw-r--r--arch/x86/entry/Makefile1
-rw-r--r--arch/x86/entry/calling.h9
-rw-r--r--arch/x86/entry/common.c318
-rw-r--r--arch/x86/entry/entry_32.S130
-rw-r--r--arch/x86/entry/entry_64.S510
-rw-r--r--arch/x86/entry/entry_64_compat.S78
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl17
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/Makefile8
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c16
-rw-r--r--arch/x86/entry/vdso/vma.c7
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c4
-rw-r--r--arch/x86/ia32/ia32_signal.c93
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/arch_hweight.h13
-rw-r--r--arch/x86/include/asm/atomic.h25
-rw-r--r--arch/x86/include/asm/atomic64_32.h14
-rw-r--r--arch/x86/include/asm/atomic64_64.h15
-rw-r--r--arch/x86/include/asm/barrier.h19
-rw-r--r--arch/x86/include/asm/cacheflush.h73
-rw-r--r--arch/x86/include/asm/context_tracking.h10
-rw-r--r--arch/x86/include/asm/cpufeature.h5
-rw-r--r--arch/x86/include/asm/delay.h1
-rw-r--r--arch/x86/include/asm/desc.h15
-rw-r--r--arch/x86/include/asm/dma-mapping.h34
-rw-r--r--arch/x86/include/asm/efi.h12
-rw-r--r--arch/x86/include/asm/elf.h17
-rw-r--r--arch/x86/include/asm/espfix.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h72
-rw-r--r--arch/x86/include/asm/ftrace.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h6
-rw-r--r--arch/x86/include/asm/ia32.h9
-rw-r--r--arch/x86/include/asm/intel_pmc_ipc.h27
-rw-r--r--arch/x86/include/asm/io.h8
-rw-r--r--arch/x86/include/asm/iosf_mbi.h8
-rw-r--r--arch/x86/include/asm/irq.h4
-rw-r--r--arch/x86/include/asm/irq_vectors.h10
-rw-r--r--arch/x86/include/asm/jump_label.h23
-rw-r--r--arch/x86/include/asm/kasan.h11
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h47
-rw-r--r--arch/x86/include/asm/math_emu.h6
-rw-r--r--arch/x86/include/asm/mce.h8
-rw-r--r--arch/x86/include/asm/mm-arch-hooks.h15
-rw-r--r--arch/x86/include/asm/mmu.h5
-rw-r--r--arch/x86/include/asm/mmu_context.h70
-rw-r--r--arch/x86/include/asm/mshyperv.h5
-rw-r--r--arch/x86/include/asm/msr-index.h23
-rw-r--r--arch/x86/include/asm/msr.h70
-rw-r--r--arch/x86/include/asm/mwait.h45
-rw-r--r--arch/x86/include/asm/paravirt.h40
-rw-r--r--arch/x86/include/asm/paravirt_types.h3
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/perf_event.h7
-rw-r--r--arch/x86/include/asm/pmc_atom.h29
-rw-r--r--arch/x86/include/asm/pmem.h153
-rw-r--r--arch/x86/include/asm/preempt.h4
-rw-r--r--arch/x86/include/asm/processor.h31
-rw-r--r--arch/x86/include/asm/ptrace.h1
-rw-r--r--arch/x86/include/asm/pvclock-abi.h1
-rw-r--r--arch/x86/include/asm/pvclock.h10
-rw-r--r--arch/x86/include/asm/qrwlock.h10
-rw-r--r--arch/x86/include/asm/qspinlock.h19
-rw-r--r--arch/x86/include/asm/sigcontext.h6
-rw-r--r--arch/x86/include/asm/sigframe.h10
-rw-r--r--arch/x86/include/asm/signal.h2
-rw-r--r--arch/x86/include/asm/stackprotector.h2
-rw-r--r--arch/x86/include/asm/switch_to.h12
-rw-r--r--arch/x86/include/asm/syscalls.h1
-rw-r--r--arch/x86/include/asm/thread_info.h27
-rw-r--r--arch/x86/include/asm/tlbflush.h6
-rw-r--r--arch/x86/include/asm/traps.h4
-rw-r--r--arch/x86/include/asm/tsc.h19
-rw-r--r--arch/x86/include/asm/vm86.h57
-rw-r--r--arch/x86/include/asm/vmx.h47
-rw-r--r--arch/x86/include/asm/xen/events.h11
-rw-r--r--arch/x86/include/asm/xen/hypercall.h10
-rw-r--r--arch/x86/include/asm/xen/interface.h219
-rw-r--r--arch/x86/include/asm/xen/page.h47
-rw-r--r--arch/x86/include/uapi/asm/bitsperlong.h2
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h2
-rw-r--r--arch/x86/include/uapi/asm/e820.h2
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h4
-rw-r--r--arch/x86/include/uapi/asm/mce.h3
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h21
-rw-r--r--arch/x86/include/uapi/asm/vmx.h2
-rw-r--r--arch/x86/kernel/Makefile12
-rw-r--r--arch/x86/kernel/acpi/boot.c9
-rw-r--r--arch/x86/kernel/alternative.c5
-rw-r--r--arch/x86/kernel/apb_timer.c8
-rw-r--r--arch/x86/kernel/apic/apic.c113
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c2
-rw-r--r--arch/x86/kernel/apic/apic_noop.c1
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c2
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c1
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c133
-rw-r--r--arch/x86/kernel/apic/io_apic.c9
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/probe_32.c1
-rw-r--r--arch/x86/kernel/apic/vector.c97
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c3
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c1
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c2
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/check.c5
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c10
-rw-r--r--arch/x86/kernel/cpu/common.c32
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/intel.c47
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h39
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-genpool.c99
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c244
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c61
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c7
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c52
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c20
-rw-r--r--arch/x86/kernel/cpu/perf_event.h26
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c332
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c16
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c106
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c58
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c85
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c11
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c23
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c166
-rw-r--r--arch/x86/kernel/cpu/perf_event_msr.c242
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/crash.c7
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/espfix_64.c30
-rw-r--r--arch/x86/kernel/fpu/core.c2
-rw-r--r--arch/x86/kernel/fpu/init.c53
-rw-r--r--arch/x86/kernel/head64.c10
-rw-r--r--arch/x86/kernel/head_64.S29
-rw-r--r--arch/x86/kernel/hpet.c206
-rw-r--r--arch/x86/kernel/hw_breakpoint.c31
-rw-r--r--arch/x86/kernel/i8253.c2
-rw-r--r--arch/x86/kernel/irq.c132
-rw-r--r--arch/x86/kernel/irq_32.c23
-rw-r--r--arch/x86/kernel/irq_64.c9
-rw-r--r--arch/x86/kernel/irqinit.c6
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c7
-rw-r--r--arch/x86/kernel/kvmclock.c4
-rw-r--r--arch/x86/kernel/ldt.c262
-rw-r--r--arch/x86/kernel/nmi.c133
-rw-r--r--arch/x86/kernel/paravirt.c18
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-dma.c60
-rw-r--r--arch/x86/kernel/pmem.c79
-rw-r--r--arch/x86/kernel/process.c64
-rw-r--r--arch/x86/kernel/process_32.c29
-rw-r--r--arch/x86/kernel/process_64.c34
-rw-r--r--arch/x86/kernel/ptrace.c340
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c29
-rw-r--r--arch/x86/kernel/signal.c59
-rw-r--r--arch/x86/kernel/signal_compat.c95
-rw-r--r--arch/x86/kernel/smp.c2
-rw-r--r--arch/x86/kernel/smpboot.c66
-rw-r--r--arch/x86/kernel/step.c10
-rw-r--r--arch/x86/kernel/topology.c4
-rw-r--r--arch/x86/kernel/trace_clock.c7
-rw-r--r--arch/x86/kernel/traps.c90
-rw-r--r--arch/x86/kernel/tsc.c66
-rw-r--r--arch/x86/kernel/tsc_sync.c14
-rw-r--r--arch/x86/kernel/uprobes.c9
-rw-r--r--arch/x86/kernel/vm86_32.c400
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/hyperv.c377
-rw-r--r--arch/x86/kvm/hyperv.h32
-rw-r--r--arch/x86/kvm/i8259.c15
-rw-r--r--arch/x86/kvm/iommu.c2
-rw-r--r--arch/x86/kvm/irq.h8
-rw-r--r--arch/x86/kvm/lapic.c11
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c309
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/mtrr.c40
-rw-r--r--arch/x86/kvm/paging_tmpl.h13
-rw-r--r--arch/x86/kvm/pmu_amd.c2
-rw-r--r--arch/x86/kvm/svm.c13
-rw-r--r--arch/x86/kvm/vmx.c202
-rw-r--r--arch/x86/kvm/x86.c391
-rw-r--r--arch/x86/kvm/x86.h10
-rw-r--r--arch/x86/lguest/boot.c91
-rw-r--r--arch/x86/lib/delay.c60
-rw-r--r--arch/x86/lib/usercopy.c2
-rw-r--r--arch/x86/math-emu/fpu_entry.c3
-rw-r--r--arch/x86/math-emu/fpu_system.h21
-rw-r--r--arch/x86/math-emu/get_address.c4
-rw-r--r--arch/x86/mm/fault.c7
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c6
-rw-r--r--arch/x86/mm/ioremap.c23
-rw-r--r--arch/x86/mm/kasan_init_64.c138
-rw-r--r--arch/x86/mm/mmap.c7
-rw-r--r--arch/x86/mm/mpx.c75
-rw-r--r--arch/x86/mm/numa.c6
-rw-r--r--arch/x86/mm/pageattr-test.c4
-rw-r--r--arch/x86/mm/pageattr.c1
-rw-r--r--arch/x86/mm/srat.c5
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/net/bpf_jit_comp.c94
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/fixup.c13
-rw-r--r--arch/x86/pci/intel_mid_pci.c45
-rw-r--r--arch/x86/pci/irq.c23
-rw-r--r--arch/x86/pci/xen.c8
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/atom/Makefile3
-rw-r--r--arch/x86/platform/atom/pmc_atom.c (renamed from arch/x86/kernel/pmc_atom.c)315
-rw-r--r--arch/x86/platform/efi/efi.c76
-rw-r--r--arch/x86/platform/intel/Makefile1
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c (renamed from arch/x86/kernel/iosf_mbi.c)27
-rw-r--r--arch/x86/platform/uv/uv_irq.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c6
-rw-r--r--arch/x86/platform/uv/uv_time.c37
-rw-r--r--arch/x86/power/cpu.c3
-rw-r--r--arch/x86/ras/Kconfig11
-rw-r--r--arch/x86/ras/Makefile2
-rw-r--r--arch/x86/ras/mce_amd_inj.c375
-rw-r--r--arch/x86/um/asm/barrier.h13
-rw-r--r--arch/x86/xen/Kconfig25
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/apic.c6
-rw-r--r--arch/x86/xen/enlighten.c87
-rw-r--r--arch/x86/xen/mmu.c431
-rw-r--r--arch/x86/xen/p2m.c62
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/pmu.c570
-rw-r--r--arch/x86/xen/pmu.h15
-rw-r--r--arch/x86/xen/setup.c498
-rw-r--r--arch/x86/xen/smp.c31
-rw-r--r--arch/x86/xen/suspend.c23
-rw-r--r--arch/x86/xen/time.c80
-rw-r--r--arch/x86/xen/xen-head.S2
-rw-r--r--arch/x86/xen/xen-ops.h13
272 files changed, 7988 insertions, 4411 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced17dc95..96d058a87100 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,7 +27,8 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_PMEM_API
+ select ARCH_HAS_PMEM_API if X86_64
+ select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
@@ -41,6 +42,8 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
select ARCH_WANT_OPTIONAL_GPIOLIB
@@ -132,7 +135,7 @@ config X86
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_SYSCALL_TRACEPOINTS
- select HAVE_UID16 if X86_32
+ select HAVE_UID16 if X86_32 || IA32_EMULATION
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
@@ -254,6 +257,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config KASAN_SHADOW_OFFSET
+ hex
+ depends on KASAN
+ default 0xdffffc0000000000
+
config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
@@ -949,6 +957,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
config X86_MCE
bool "Machine Check / overheating reporting"
+ select GENERIC_ALLOCATOR
default y
---help---
Machine Check support allows the processor to notify the
@@ -996,19 +1005,42 @@ config X86_THERMAL_VECTOR
def_bool y
depends on X86_MCE_INTEL
-config VM86
- bool "Enable VM86 support" if EXPERT
- default y
+config X86_LEGACY_VM86
+ bool "Legacy VM86 support"
+ default n
depends on X86_32
---help---
- This option is required by programs like DOSEMU to run
- 16-bit real mode legacy code on x86 processors. It also may
- be needed by software like XFree86 to initialize some video
- cards via BIOS. Disabling this option saves about 6K.
+ This option allows user programs to put the CPU into V8086
+ mode, which is an 80286-era approximation of 16-bit real mode.
+
+ Some very old versions of X and/or vbetool require this option
+ for user mode setting. Similarly, DOSEMU will use it if
+ available to accelerate real mode DOS programs. However, any
+ recent version of DOSEMU, X, or vbetool should be fully
+ functional even without kernel VM86 support, as they will all
+ fall back to software emulation. Nevertheless, if you are using
+ a 16-bit DOS program where 16-bit performance matters, vm86
+ mode might be faster than emulation and you might want to
+ enable this option.
+
+ Note that any app that works on a 64-bit kernel is unlikely to
+ need this option, as 64-bit kernels don't, and can't, support
+ V8086 mode. This option is also unrelated to 16-bit protected
+ mode and is not needed to run most 16-bit programs under Wine.
+
+ Enabling this option increases the complexity of the kernel
+ and slows down exception handling a tiny bit.
+
+ If unsure, say N here.
+
+config VM86
+ bool
+ default X86_LEGACY_VM86
config X86_16BIT
bool "Enable support for 16-bit segments" if EXPERT
default y
+ depends on MODIFY_LDT_SYSCALL
---help---
This option is required by programs like Wine to run 16-bit
protected mode legacy code on x86 processors. Disabling
@@ -1276,6 +1308,7 @@ config HIGHMEM
config X86_PAE
bool "PAE (Physical Address Extension) Support"
depends on X86_32 && !HIGHMEM4G
+ select SWIOTLB
---help---
PAE is required for NX support, and furthermore enables
larger swapspace support for non-overcommit purposes. It
@@ -1420,10 +1453,14 @@ config ILLEGAL_POINTER_VALUE
source "mm/Kconfig"
+config X86_PMEM_LEGACY_DEVICE
+ bool
+
config X86_PMEM_LEGACY
- bool "Support non-standard NVDIMMs and ADR protected memory"
+ tristate "Support non-standard NVDIMMs and ADR protected memory"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
+ select X86_PMEM_LEGACY_DEVICE
select LIBNVDIMM
help
Treat memory marked using the non-standard e820 type of 12 as used
@@ -1503,6 +1540,7 @@ config X86_RESERVE_LOW
config MATH_EMULATION
bool
+ depends on MODIFY_LDT_SYSCALL
prompt "Math emulation" if X86_32
---help---
Linux can emulate a math coprocessor (used for floating point
@@ -1718,6 +1756,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ select KEXEC_CORE
---help---
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1734,8 +1773,8 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
+ select KEXEC_CORE
select BUILD_BIN2C
- depends on KEXEC
depends on X86_64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
@@ -2015,7 +2054,7 @@ config CMDLINE_BOOL
To compile command line arguments into the kernel,
set this option to 'Y', then fill in the
- the boot arguments in CONFIG_CMDLINE.
+ boot arguments in CONFIG_CMDLINE.
Systems with fully functional boot loaders (i.e. non-embedded)
should leave this option set to 'N'.
@@ -2047,6 +2086,22 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
+config MODIFY_LDT_SYSCALL
+ bool "Enable the LDT (local descriptor table)" if EXPERT
+ default y
+ ---help---
+ Linux can allow user programs to install a per-process x86
+ Local Descriptor Table (LDT) using the modify_ldt(2) system
+ call. This is required to run 16-bit or segmented code such as
+ DOSEMU or some Wine programs. It is also used by some very old
+ threading libraries.
+
+ Enabling this feature adds a small amount of overhead to
+ context switches and increases the low-level kernel attack
+ surface. Disabling it removes the modify_ldt(2) system call.
+
+ Saying 'N' here may make sense for embedded or server kernels.
+
source "kernel/livepatch/Kconfig"
endmenu
@@ -2516,7 +2571,7 @@ config IA32_EMULATION
depends on X86_64
select BINFMT_ELF
select COMPAT_BINFMT_ELF
- select HAVE_UID16
+ select ARCH_WANT_OLD_COMPAT_IPC
---help---
Include code to run legacy 32-bit programs under a
64-bit kernel. You should likely turn this on, unless you're
@@ -2530,7 +2585,7 @@ config IA32_AOUT
config X86_X32
bool "x32 ABI for 64-bit mode"
- depends on X86_64 && IA32_EMULATION
+ depends on X86_64
---help---
Include code to run binaries for the x32 native 32-bit ABI
for 64-bit processors. An x32 process gets access to the
@@ -2544,7 +2599,6 @@ config X86_X32
config COMPAT
def_bool y
depends on IA32_EMULATION || X86_X32
- select ARCH_WANT_OLD_COMPAT_IPC
if COMPAT
config COMPAT_FOR_U64_ALIGNMENT
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index a15893d17c55..d8c0d3266173 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -297,6 +297,18 @@ config OPTIMIZE_INLINING
If unsure, say N.
+config DEBUG_ENTRY
+ bool "Debug low-level entry code"
+ depends on DEBUG_KERNEL
+ ---help---
+ This option enables sanity checks in x86's low-level entry code.
+ Some of these sanity checks may slow down kernel entries and
+ exits or otherwise impact performance.
+
+ This is currently used to help test NMI code.
+
+ If unsure, say N.
+
config DEBUG_NMI_SELFTEST
bool "NMI Selftest"
depends on DEBUG_KERNEL && X86_LOCAL_APIC
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 118e6debc483..747860c696e1 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
LDFLAGS_vmlinux := --emit-relocs
endif
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+
ifeq ($(CONFIG_X86_32),y)
BITS := 32
UTS_MACHINE := i386
@@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
@@ -212,6 +219,8 @@ drivers-$(CONFIG_PM) += arch/x86/power/
drivers-$(CONFIG_FB) += arch/x86/video/
+drivers-$(CONFIG_RAS) += arch/x86/ras/
+
####
# boot loader support. Several targets are kept for legacy purposes
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 57bbf2fb21f6..0d553e54171b 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -23,7 +23,7 @@ targets += fdimage fdimage144 fdimage288 image.iso mtools.conf
subdir- := compressed
setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpuflags.o cpucheck.o
-setup-y += early_serial_console.o edd.o header.o main.o mca.o memory.o
+setup-y += early_serial_console.o edd.o header.o main.o memory.o
setup-y += pm.o pmjump.o printf.o regs.o string.o tty.o video.o
setup-y += video-mode.o version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index bd49ec61255c..0033e96c3f09 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -307,9 +307,6 @@ void query_edd(void);
/* header.S */
void __attribute__((noreturn)) die(void);
-/* mca.c */
-int query_mca(void);
-
/* memory.c */
int detect_memory(void);
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d7b1f655b3ef..6a9b96b4624d 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
if (has_cpuflag(X86_FEATURE_TSC)) {
debug_putstr(" RDTSC");
- rdtscll(raw);
+ raw = rdtsc();
random ^= raw;
use_i8254 = false;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 2c82bd150d43..ee1b6d346b98 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1041,7 +1041,6 @@ void setup_graphics(struct boot_params *boot_params)
struct boot_params *make_boot_params(struct efi_config *c)
{
struct boot_params *boot_params;
- struct sys_desc_table *sdt;
struct apm_bios_info *bi;
struct setup_header *hdr;
struct efi_info *efi;
@@ -1089,7 +1088,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
hdr = &boot_params->hdr;
efi = &boot_params->efi_info;
bi = &boot_params->apm_bios_info;
- sdt = &boot_params->sys_desc_table;
/* Copy the second sector to boot_params */
memcpy(&hdr->jump, image->image_base + 512, 512);
@@ -1118,8 +1116,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
/* Clear APM BIOS info */
memset(bi, 0, sizeof(*bi));
- memset(sdt, 0, sizeof(*sdt));
-
status = efi_parse_options(cmdline_ptr);
if (status != EFI_SUCCESS)
goto fail2;
@@ -1193,6 +1189,10 @@ static efi_status_t setup_e820(struct boot_params *params,
unsigned int e820_type = 0;
unsigned long m = efi->efi_memmap;
+#ifdef CONFIG_X86_64
+ m |= (u64)efi->efi_memmap_hi << 32;
+#endif
+
d = (efi_memory_desc_t *)(m + (i * efi->efi_memdesc_size));
switch (d->type) {
case EFI_RESERVED_TYPE:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a107b935e22f..79dac1758e7c 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -220,6 +220,23 @@ void __putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
+void __puthex(unsigned long value)
+{
+ char alpha[2] = "0";
+ int bits;
+
+ for (bits = sizeof(value) * 8 - 4; bits >= 0; bits -= 4) {
+ unsigned long digit = (value >> bits) & 0xf;
+
+ if (digit < 0xA)
+ alpha[0] = '0' + digit;
+ else
+ alpha[0] = 'a' + (digit - 0xA);
+
+ __putstr(alpha);
+ }
+}
+
static void error(char *x)
{
error_putstr("\n\n");
@@ -399,6 +416,13 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ /* Report initial kernel position details. */
+ debug_putaddr(input_data);
+ debug_putaddr(input_len);
+ debug_putaddr(output);
+ debug_putaddr(output_len);
+ debug_putaddr(run_size);
+
/*
* The memory hole needed for the kernel is the larger of either
* the entire decompressed kernel plus relocation table, or the
@@ -424,7 +448,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
#endif
debug_putstr("\nDecompressing Linux... ");
- decompress(input_data, input_len, NULL, NULL, output, NULL, error);
+ __decompress(input_data, input_len, NULL, NULL, output, output_len,
+ NULL, error);
parse_elf(output);
/*
* 32-bit always performs relocations. 64-bit relocations are only
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 805d25ca5f1d..3783dc3e10b3 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -34,16 +34,27 @@ extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
extern struct boot_params *real_mode; /* Pointer to real-mode data */
void __putstr(const char *s);
+void __puthex(unsigned long value);
#define error_putstr(__x) __putstr(__x)
+#define error_puthex(__x) __puthex(__x)
#ifdef CONFIG_X86_VERBOSE_BOOTUP
#define debug_putstr(__x) __putstr(__x)
+#define debug_puthex(__x) __puthex(__x)
+#define debug_putaddr(__x) { \
+ debug_putstr(#__x ": 0x"); \
+ debug_puthex((unsigned long)(__x)); \
+ debug_putstr("\n"); \
+ }
#else
static inline void debug_putstr(const char *s)
{ }
+static inline void debug_puthex(const char *s)
+{ }
+#define debug_putaddr(x) /* */
#endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 16ef02596db2..2d6b309c8e9a 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -414,7 +414,7 @@ xloadflags:
# define XLF23 0
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC)
+#if defined(CONFIG_X86_64) && defined(CONFIG_EFI) && defined(CONFIG_KEXEC_CORE)
# define XLF4 XLF_EFI_KEXEC
#else
# define XLF4 0
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index fd6c9f236996..9bcea386db65 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -161,9 +161,6 @@ void main(void)
/* Set keyboard repeat rate (why?) and query the lock flags */
keyboard_init();
- /* Query MCA information */
- query_mca();
-
/* Query Intel SpeedStep (IST) information */
query_ist();
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
deleted file mode 100644
index a95a531148ef..000000000000
--- a/arch/x86/boot/mca.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright 2007 rPath, Inc. - All Rights Reserved
- * Copyright 2009 Intel Corporation; author H. Peter Anvin
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * Get the MCA system description table
- */
-
-#include "boot.h"
-
-int query_mca(void)
-{
- struct biosregs ireg, oreg;
- u16 len;
-
- initregs(&ireg);
- ireg.ah = 0xc0;
- intcall(0x15, &ireg, &oreg);
-
- if (oreg.eflags & X86_EFLAGS_CF)
- return -1; /* No MCA present */
-
- set_fs(oreg.es);
- len = rdfs16(oreg.bx);
-
- if (len > sizeof(boot_params.sys_desc_table))
- len = sizeof(boot_params.sys_desc_table);
-
- copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
- return 0;
-}
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index aaa1118bf01e..028be48c8839 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -23,6 +23,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 315b86106572..cb5b3ab5beec 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -22,6 +22,7 @@ CONFIG_BLK_DEV_INITRD=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
CONFIG_MODULE_FORCE_UNLOAD=y
@@ -207,7 +208,6 @@ CONFIG_AGP_AMD64=y
CONFIG_AGP_INTEL=y
CONFIG_DRM=y
CONFIG_DRM_I915=y
-CONFIG_DRM_I915_KMS=y
CONFIG_FB_MODE_HELPERS=y
CONFIG_FB_TILEBLITTING=y
CONFIG_FB_EFI=y
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7a144971db79..bd55dedd7614 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,6 +2,7 @@
# Makefile for the x86 low level entry code
#
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y += common.o
obj-y += vdso/
obj-y += vsyscall/
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4e6308c4200..3c71dd947c7b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
movq %rbp, 4*8+\offset(%rsp)
movq %rbx, 5*8+\offset(%rsp)
.endm
- .macro SAVE_EXTRA_REGS_RBP offset=0
- movq %rbp, 4*8+\offset(%rsp)
- .endm
.macro RESTORE_EXTRA_REGS offset=0
movq 0*8+\offset(%rsp), %r15
@@ -193,12 +190,6 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_C_REGS_EXCEPT_RCX_R11
RESTORE_C_REGS_HELPER 1,0,0,1,1
.endm
- .macro RESTORE_RSI_RDI
- RESTORE_C_REGS_HELPER 0,0,0,0,0
- .endm
- .macro RESTORE_RSI_RDI_RDX
- RESTORE_C_REGS_HELPER 0,0,0,0,1
- .endm
.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
subq $-(15*8+\addskip), %rsp
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 000000000000..80dcc9261ca3
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,318 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors. The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/* Called on entry from user mode with IRQs off. */
+__visible void enter_from_user_mode(void)
+{
+ CT_WARN_ON(ct_state() != CONTEXT_USER);
+ user_exit();
+}
+#endif
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ audit_syscall_entry(regs->orig_ax, regs->di,
+ regs->si, regs->dx, regs->r10);
+ } else
+#endif
+ {
+ audit_syscall_entry(regs->orig_ax, regs->bx,
+ regs->cx, regs->dx, regs->si);
+ }
+}
+
+/*
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2. If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0: resume the syscall
+ * 1: go to phase 2; no seccomp phase 2 needed
+ * anything else: go to phase 2; pass return value to seccomp
+ */
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+ unsigned long ret = 0;
+ u32 work;
+
+ BUG_ON(regs != task_pt_regs(current));
+
+ work = ACCESS_ONCE(current_thread_info()->flags) &
+ _TIF_WORK_SYSCALL_ENTRY;
+
+#ifdef CONFIG_CONTEXT_TRACKING
+ /*
+ * If TIF_NOHZ is set, we are required to call user_exit() before
+ * doing anything that could touch RCU.
+ */
+ if (work & _TIF_NOHZ) {
+ enter_from_user_mode();
+ work &= ~_TIF_NOHZ;
+ }
+#endif
+
+#ifdef CONFIG_SECCOMP
+ /*
+ * Do seccomp first -- it should minimize exposure of other
+ * code, and keeping seccomp fast is probably more valuable
+ * than the rest of this.
+ */
+ if (work & _TIF_SECCOMP) {
+ struct seccomp_data sd;
+
+ sd.arch = arch;
+ sd.nr = regs->orig_ax;
+ sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+ if (arch == AUDIT_ARCH_X86_64) {
+ sd.args[0] = regs->di;
+ sd.args[1] = regs->si;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->r10;
+ sd.args[4] = regs->r8;
+ sd.args[5] = regs->r9;
+ } else
+#endif
+ {
+ sd.args[0] = regs->bx;
+ sd.args[1] = regs->cx;
+ sd.args[2] = regs->dx;
+ sd.args[3] = regs->si;
+ sd.args[4] = regs->di;
+ sd.args[5] = regs->bp;
+ }
+
+ BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+ BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+ ret = seccomp_phase1(&sd);
+ if (ret == SECCOMP_PHASE1_SKIP) {
+ regs->orig_ax = -1;
+ ret = 0;
+ } else if (ret != SECCOMP_PHASE1_OK) {
+ return ret; /* Go directly to phase 2 */
+ }
+
+ work &= ~_TIF_SECCOMP;
+ }
+#endif
+
+ /* Do our best to finish without phase 2. */
+ if (work == 0)
+ return ret; /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+ if (work == _TIF_SYSCALL_AUDIT) {
+ /*
+ * If there is no more work to be done except auditing,
+ * then audit in phase 1. Phase 2 always audits, so, if
+ * we audit here, then we can't go on to phase 2.
+ */
+ do_audit_syscall_entry(regs, arch);
+ return 0;
+ }
+#endif
+
+ return 1; /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+ unsigned long phase1_result)
+{
+ long ret = 0;
+ u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+ _TIF_WORK_SYSCALL_ENTRY;
+
+ BUG_ON(regs != task_pt_regs(current));
+
+ /*
+ * If we stepped into a sysenter/syscall insn, it trapped in
+ * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+ * If user-mode had set TF itself, then it's still clear from
+ * do_debug() and we need to set it again to restore the user
+ * state. If we entered on the slow path, TF was already set.
+ */
+ if (work & _TIF_SINGLESTEP)
+ regs->flags |= X86_EFLAGS_TF;
+
+#ifdef CONFIG_SECCOMP
+ /*
+ * Call seccomp_phase2 before running the other hooks so that
+ * they can see any changes made by a seccomp tracer.
+ */
+ if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ return -1;
+ }
+#endif
+
+ if (unlikely(work & _TIF_SYSCALL_EMU))
+ ret = -1L;
+
+ if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+ tracehook_report_syscall_entry(regs))
+ ret = -1L;
+
+ if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+ trace_sys_enter(regs, regs->orig_ax);
+
+ do_audit_syscall_entry(regs, arch);
+
+ return ret ?: regs->orig_ax;
+}
+
+long syscall_trace_enter(struct pt_regs *regs)
+{
+ u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+ unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+ if (phase1_result == 0)
+ return regs->orig_ax;
+ else
+ return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+ unsigned long top_of_stack =
+ (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+ return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
+/* Called with IRQs disabled. */
+__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ if (WARN_ON(!irqs_disabled()))
+ local_irq_disable();
+
+ /*
+ * In order to return to user mode, we need to have IRQs off with
+ * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
+ * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags
+ * can be set at any time on preemptable kernels if we have IRQs on,
+ * so we need to loop. Disabling preemption wouldn't help: doing the
+ * work to clear some of the flags can sleep.
+ */
+ while (true) {
+ u32 cached_flags =
+ READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+ if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
+ _TIF_UPROBE | _TIF_NEED_RESCHED |
+ _TIF_USER_RETURN_NOTIFY)))
+ break;
+
+ /* We have work to do. */
+ local_irq_enable();
+
+ if (cached_flags & _TIF_NEED_RESCHED)
+ schedule();
+
+ if (cached_flags & _TIF_UPROBE)
+ uprobe_notify_resume(regs);
+
+ /* deal with pending signal delivery */
+ if (cached_flags & _TIF_SIGPENDING)
+ do_signal(regs);
+
+ if (cached_flags & _TIF_NOTIFY_RESUME) {
+ clear_thread_flag(TIF_NOTIFY_RESUME);
+ tracehook_notify_resume(regs);
+ }
+
+ if (cached_flags & _TIF_USER_RETURN_NOTIFY)
+ fire_user_return_notifiers();
+
+ /* Disable IRQs and retry */
+ local_irq_disable();
+ }
+
+ user_enter();
+}
+
+/*
+ * Called with IRQs on and fully valid regs. Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible void syscall_return_slowpath(struct pt_regs *regs)
+{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
+ u32 cached_flags = READ_ONCE(ti->flags);
+ bool step;
+
+ CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+ if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
+ regs->orig_ax))
+ local_irq_enable();
+
+ /*
+ * First do one-time work. If these work items are enabled, we
+ * want to run them exactly once per syscall exit with IRQs on.
+ */
+ if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
+ _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
+ audit_syscall_exit(regs);
+
+ if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, regs->ax);
+
+ /*
+ * If TIF_SYSCALL_EMU is set, we only get here because of
+ * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+ * We already reported this syscall instruction in
+ * syscall_trace_enter().
+ */
+ step = unlikely(
+ (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+ == _TIF_SINGLESTEP);
+ if (step || cached_flags & _TIF_SYSCALL_TRACE)
+ tracehook_report_syscall_exit(regs, step);
+ }
+
+#ifdef CONFIG_COMPAT
+ /*
+ * Compat syscalls set TS_COMPAT. Make sure we clear it before
+ * returning to user mode.
+ */
+ ti->status &= ~TS_COMPAT;
+#endif
+
+ local_irq_disable();
+ prepare_exit_to_usermode(regs);
+}
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21dc60a60b5f..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -45,16 +45,6 @@
#include <asm/asm.h>
#include <asm/smap.h>
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-# define sysenter_audit syscall_trace_entry
-# define sysexit_audit syscall_exit_work
-#endif
-
.section .entry.text, "ax"
/*
@@ -266,14 +256,10 @@ ret_from_intr:
ENTRY(resume_userspace)
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
+ movl %esp, %eax
+ call prepare_exit_to_usermode
jmp restore_all
END(ret_from_exception)
@@ -339,7 +325,7 @@ sysenter_past_esp:
GET_THREAD_INFO(%ebp)
testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
- jnz sysenter_audit
+ jnz syscall_trace_entry
sysenter_do_call:
cmpl $(NR_syscalls), %eax
jae sysenter_badsys
@@ -351,7 +337,7 @@ sysenter_after_call:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
- jnz sysexit_audit
+ jnz syscall_exit_work_irqs_off
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
@@ -362,40 +348,6 @@ sysenter_exit:
PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
- jnz syscall_trace_entry
- /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
- movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
- /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
- pushl PT_ESI(%esp) /* a3: 5th arg */
- pushl PT_EDX+4(%esp) /* a2: 4th arg */
- call __audit_syscall_entry
- popl %ecx /* get that remapped edx off the stack */
- popl %ecx /* get that remapped esi off the stack */
- movl PT_EAX(%esp), %eax /* reload syscall number */
- jmp sysenter_do_call
-
-sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax, %edx /* second arg, syscall return value */
- cmpl $-MAX_ERRNO, %eax /* is it an error ? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al, %eax /* zero-extend that */
- call __audit_syscall_exit
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- movl PT_EAX(%esp), %eax /* reload syscall return value */
- jmp sysenter_exit
-#endif
-
.pushsection .fixup, "ax"
2: movl $0, PT_FS(%esp)
jmp 1b
@@ -421,13 +373,7 @@ syscall_after_call:
movl %eax, PT_EAX(%esp) # store the return value
syscall_exit:
LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jnz syscall_exit_work
+ jmp syscall_exit_work
restore_all:
TRACE_IRQS_IRET
@@ -504,57 +450,6 @@ ldt_ss:
#endif
ENDPROC(entry_INT80_32)
- # perform work that needs to be done immediately before resumption
- ALIGN
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jnz work_notifysig_v86 # returning to kernel-space or
- # vm86-space
-1:
-#else
- movl %esp, %eax
-#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- movb PT_CS(%esp), %bl
- andb $SEGMENT_RPL_MASK, %bl
- cmpb $USER_RPL, %bl
- jb resume_kernel
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
-
-#ifdef CONFIG_VM86
- ALIGN
-work_notifysig_v86:
- pushl %ecx # save ti_flags for do_notify_resume
- call save_v86_state # %eax contains pt_regs pointer
- popl %ecx
- movl %eax, %esp
- jmp 1b
-#endif
-END(work_pending)
-
# perform syscall exit tracing
ALIGN
syscall_trace_entry:
@@ -569,15 +464,14 @@ END(syscall_trace_entry)
# perform syscall exit tracing
ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
+syscall_exit_work_irqs_off:
TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
+ ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
+ call syscall_return_slowpath
+ jmp restore_all
END(syscall_exit_work)
syscall_fault:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c4302df1..055a01de7c8d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -33,7 +33,6 @@
#include <asm/paravirt.h>
#include <asm/percpu.h>
#include <asm/asm.h>
-#include <asm/context_tracking.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
#include <linux/err.h>
@@ -229,6 +228,11 @@ entry_SYSCALL_64_fastpath:
*/
USERGS_SYSRET64
+GLOBAL(int_ret_from_sys_call_irqs_off)
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ jmp int_ret_from_sys_call
+
/* Do syscall entry tracing */
tracesys:
movq %rsp, %rdi
@@ -272,69 +276,11 @@ tracesys_phase2:
* Has correct iret frame.
*/
GLOBAL(int_ret_from_sys_call)
- DISABLE_INTERRUPTS(CLBR_NONE)
-int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
- TRACE_IRQS_OFF
- movl $_TIF_ALLWORK_MASK, %edi
- /* edi: mask to check */
-GLOBAL(int_with_check)
- LOCKDEP_SYS_EXIT_IRQ
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx), %edx
- andl %edi, %edx
- jnz int_careful
- andl $~TS_COMPAT, TI_status(%rcx)
- jmp syscall_return
-
- /*
- * Either reschedule or signal or syscall exit tracking needed.
- * First do a reschedule test.
- * edx: work, edi: workmask
- */
-int_careful:
- bt $TIF_NEED_RESCHED, %edx
- jnc int_very_careful
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- SCHEDULE_USER
- popq %rdi
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
- /* handle signals and tracing -- both require a full pt_regs */
-int_very_careful:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
- /* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT, %edx
- jz int_signal
- pushq %rdi
- leaq 8(%rsp), %rdi /* &ptregs -> arg1 */
- call syscall_trace_leave
- popq %rdi
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
- jmp int_restore_rest
-
-int_signal:
- testl $_TIF_DO_NOTIFY_MASK, %edx
- jz 1f
- movq %rsp, %rdi /* &ptregs -> arg1 */
- xorl %esi, %esi /* oldset -> arg2 */
- call do_notify_resume
-1: movl $_TIF_WORK_MASK, %edi
-int_restore_rest:
+ movq %rsp, %rdi
+ call syscall_return_slowpath /* returns with IRQs disabled */
RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp int_with_check
-
-syscall_return:
- /* The IRETQ could re-enable interrupts: */
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_IRETQ
+ TRACE_IRQS_IRETQ /* we're about to change IF */
/*
* Try to use SYSRET instead of IRET if we're returning to
@@ -555,23 +501,22 @@ END(irq_entries_start)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
cld
- /*
- * Since nothing in interrupt handling code touches r12...r15 members
- * of "struct pt_regs", and since interrupts can nest, we can save
- * four stack slots and simultaneously provide
- * an unwind-friendly stack layout by saving "truncated" pt_regs
- * exactly up to rbp slot, without these members.
- */
- ALLOC_PT_GPREGS_ON_STACK -RBP
- SAVE_C_REGS -RBP
- /* this goes to 0(%rsp) for unwinder, not for saving the value: */
- SAVE_EXTRA_REGS_RBP -RBP
-
- leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
- testb $3, CS-RBP(%rsp)
+ testb $3, CS(%rsp)
jz 1f
+
+ /*
+ * IRQ from user mode. Switch to kernel gsbase and inform context
+ * tracking that we're in kernel mode.
+ */
SWAPGS
+#ifdef CONFIG_CONTEXT_TRACKING
+ call enter_from_user_mode
+#endif
+
1:
/*
* Save previous stack pointer, optionally switch to interrupt stack.
@@ -580,14 +525,14 @@ END(irq_entries_start)
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
- movq %rsp, %rsi
+ movq %rsp, %rdi
incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
- pushq %rsi
+ pushq %rdi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
- call \func
+ call \func /* rdi points to pt_regs */
.endm
/*
@@ -606,34 +551,19 @@ ret_from_intr:
decl PER_CPU_VAR(irq_count)
/* Restore saved previous stack */
- popq %rsi
- /* return code expects complete pt_regs - adjust rsp accordingly: */
- leaq -RBP(%rsi), %rsp
+ popq %rsp
testb $3, CS(%rsp)
jz retint_kernel
- /* Interrupt came from user space */
-retint_user:
- GET_THREAD_INFO(%rcx)
- /* %rcx: thread info. Interrupts are off. */
-retint_with_reschedule:
- movl $_TIF_WORK_MASK, %edi
-retint_check:
+ /* Interrupt came from user space */
LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx), %edx
- andl %edi, %edx
- jnz retint_careful
-
-retint_swapgs: /* return to user-space */
- /*
- * The iretq could re-enable interrupts:
- */
- DISABLE_INTERRUPTS(CLBR_ANY)
+GLOBAL(retint_user)
+ mov %rsp,%rdi
+ call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
-
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret
/* Returning to kernel space */
retint_kernel:
@@ -657,6 +587,8 @@ retint_kernel:
* At this label, code paths which return to kernel and to user,
* which come from interrupts/exception and from syscalls, merge.
*/
+restore_regs_and_iret:
+ RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -707,37 +639,6 @@ native_irq_return_ldt:
popq %rax
jmp native_irq_return_iret
#endif
-
- /* edi: workmask, edx: work */
-retint_careful:
- bt $TIF_NEED_RESCHED, %edx
- jnc retint_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq %rdi
- SCHEDULE_USER
- popq %rdi
- GET_THREAD_INFO(%rcx)
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp retint_check
-
-retint_signal:
- testl $_TIF_DO_NOTIFY_MASK, %edx
- jz retint_swapgs
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_EXTRA_REGS
- movq $-1, ORIG_RAX(%rsp)
- xorl %esi, %esi /* oldset */
- movq %rsp, %rdi /* &pt_regs */
- call do_notify_resume
- RESTORE_EXTRA_REGS
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
-
END(common_interrupt)
/*
@@ -1143,12 +1044,22 @@ ENTRY(error_entry)
SAVE_EXTRA_REGS 8
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
- jz error_kernelspace
+ jz .Lerror_kernelspace
- /* We entered from user mode */
+.Lerror_entry_from_usermode_swapgs:
+ /*
+ * We entered from user mode or we're pretending to have entered
+ * from user mode due to an IRET fault.
+ */
SWAPGS
-error_entry_done:
+.Lerror_entry_from_usermode_after_swapgs:
+#ifdef CONFIG_CONTEXT_TRACKING
+ call enter_from_user_mode
+#endif
+
+.Lerror_entry_done:
+
TRACE_IRQS_OFF
ret
@@ -1158,31 +1069,30 @@ error_entry_done:
* truncated RIP for IRET exceptions returning to compat mode. Check
* for these here too.
*/
-error_kernelspace:
+.Lerror_kernelspace:
incl %ebx
leaq native_irq_return_iret(%rip), %rcx
cmpq %rcx, RIP+8(%rsp)
- je error_bad_iret
+ je .Lerror_bad_iret
movl %ecx, %eax /* zero extend */
cmpq %rax, RIP+8(%rsp)
- je bstep_iret
+ je .Lbstep_iret
cmpq $gs_change, RIP+8(%rsp)
- jne error_entry_done
+ jne .Lerror_entry_done
/*
* hack: gs_change can fail with user gsbase. If this happens, fix up
* gsbase and proceed. We'll fix up the exception and land in
* gs_change's error handler with kernel gsbase.
*/
- SWAPGS
- jmp error_entry_done
+ jmp .Lerror_entry_from_usermode_swapgs
-bstep_iret:
+.Lbstep_iret:
/* Fix truncated RIP */
movq %rcx, RIP+8(%rsp)
/* fall through */
-error_bad_iret:
+.Lerror_bad_iret:
/*
* We came from an IRET to user mode, so we have user gsbase.
* Switch to kernel gsbase:
@@ -1198,7 +1108,7 @@ error_bad_iret:
call fixup_bad_iret
mov %rax, %rsp
decl %ebx
- jmp error_entry_done
+ jmp .Lerror_entry_from_usermode_after_swapgs
END(error_entry)
@@ -1209,7 +1119,6 @@ END(error_entry)
*/
ENTRY(error_exit)
movl %ebx, %eax
- RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl %eax, %eax
@@ -1219,7 +1128,18 @@ END(error_exit)
/* Runs on exception stack */
ENTRY(nmi)
+ /*
+ * Fix up the exception frame if we're on Xen.
+ * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
+ * one value to the stack on native, so it may clobber the rdx
+ * scratch slot, but it won't clobber any of the important
+ * slots past it.
+ *
+ * Xen is a different story, because the Xen frame itself overlaps
+ * the "NMI executing" variable.
+ */
PARAVIRT_ADJUST_EXCEPTION_FRAME
+
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
@@ -1237,11 +1157,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1250,31 +1171,154 @@ ENTRY(nmi)
* a nested NMI that updated the copy interrupt stack frame, a
* jump will be made to the repeat_nmi code that will handle the second
* NMI.
+ *
+ * However, espfix prevents us from directly returning to userspace
+ * with a single IRET instruction. Similarly, IRET to user mode
+ * can fault. We therefore handle NMIs from user space like
+ * other IST entries.
*/
/* Use %rdx as our temp variable throughout */
pushq %rdx
+ testb $3, CS-RIP+8(%rsp)
+ jz .Lnmi_from_kernel
+
+ /*
+ * NMI from user mode. We need to run on the thread stack, but we
+ * can't go through the normal entry paths: NMIs are masked, and
+ * we don't want to enable interrupts, because then we'll end
+ * up in an awkward situation in which IRQs are on but NMIs
+ * are off.
+ *
+ * We also must not push anything to the stack before switching
+ * stacks lest we corrupt the "NMI executing" variable.
+ */
+
+ SWAPGS_UNSAFE_STACK
+ cld
+ movq %rsp, %rdx
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ pushq 5*8(%rdx) /* pt_regs->ss */
+ pushq 4*8(%rdx) /* pt_regs->rsp */
+ pushq 3*8(%rdx) /* pt_regs->flags */
+ pushq 2*8(%rdx) /* pt_regs->cs */
+ pushq 1*8(%rdx) /* pt_regs->rip */
+ pushq $-1 /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq (%rdx) /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq %rax /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
+
+ /*
+ * At this point we no longer need to worry about stack damage
+ * due to nesting -- we're on the normal thread stack and we're
+ * done with the NMI stack.
+ */
+
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
+
/*
- * If %cs was not the kernel segment, then the NMI triggered in user
- * space, which means it is definitely not nested.
+ * Return back to user mode. We must *not* do the normal exit
+ * work, because we don't want to enable interrupts. Fortunately,
+ * do_nmi doesn't modify pt_regs.
+ */
+ SWAPGS
+ jmp restore_c_regs_and_iret
+
+.Lnmi_from_kernel:
+ /*
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Determine whether we're a nested NMI.
+ *
+ * If we interrupted kernel code between repeat_nmi and
+ * end_repeat_nmi, then we are a nested NMI. We must not
+ * modify the "iret" frame because it's being written by
+ * the outer NMI. That's okay; the outer NMI handler is
+ * about to about to call do_nmi anyway, so we can just
+ * resume the outer NMI.
+ */
+
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+1:
+
+ /*
+ * Now check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi
/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET. We need to be careful, though:
+ * there is one case in which RSP could point to the NMI stack
+ * despite there being no NMI active: naughty userspace controls
+ * RSP at the very beginning of the SYSCALL targets. We can
+ * pull a fast one on naughty userspace, though: we program
+ * SYSCALL to mask DF, so userspace cannot cause DF to be set
+ * if it controls the kernel's RSP. We set DF before we clear
+ * "NMI executing".
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1286,25 +1330,20 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
- /* Ah, it is within the NMI stack, treat it as nested */
+
+ /* Ah, it is within the NMI stack. */
+
+ testb $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
+ jz first_nmi /* RSP was user controlled. */
+
+ /* This is a nested NMI. */
nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
-
-1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
+ subq $8, %rsp
leaq -10*8(%rsp), %rdx
pushq $__KERNEL_DS
pushq %rdx
@@ -1318,61 +1357,42 @@ nested_nmi:
nested_nmi_out:
popq %rdx
- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
- /* Set the NMI executing variable on the stack. */
- pushq $1
+ /* Make room for "NMI executing". */
+ pushq $0
- /* Leave room for the "copied" frame */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq 11*8(%rsp)
.endr
/* Everything up to here is safe from nested NMIs */
+#ifdef CONFIG_DEBUG_ENTRY
+ /*
+ * For ease of testing, unmask NMIs right away. Disabled by
+ * default because IRET is very expensive.
+ */
+ pushq $0 /* SS */
+ pushq %rsp /* RSP (minus 8 because of the previous push) */
+ addq $8, (%rsp) /* Fix up RSP */
+ pushfq /* RFLAGS */
+ pushq $__KERNEL_CS /* CS */
+ pushq $1f /* RIP */
+ INTERRUPT_RETURN /* continues at repeat_nmi below */
+1:
+#endif
+
+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1381,16 +1401,20 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi. "NMI executing"
+ * is zero.
*/
-repeat_nmi:
+ movq $1, 10*8(%rsp) /* Set "NMI executing". */
+
/*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
*/
- movq $1, 10*8(%rsp)
-
- /* Make another copy, this one may be modified by nested NMIs */
addq $(10*8), %rsp
.rept 5
pushq -6*8(%rsp)
@@ -1399,9 +1423,9 @@ repeat_nmi:
end_repeat_nmi:
/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK
@@ -1415,28 +1439,11 @@ end_repeat_nmi:
*/
call paranoid_entry
- /*
- * Save off the CR2 register. If we take a page fault in the NMI then
- * it could corrupt the CR2 value. If the NMI preempts a page fault
- * handler before it was able to read the CR2 register, and then the
- * NMI itself takes a page fault, the page fault that was preempted
- * will read the information from the NMI page fault and not the
- * origin fault. Save it off and restore it if it changes.
- * Use the r12 callee-saved register.
- */
- movq %cr2, %r12
-
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
call do_nmi
- /* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
-1:
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
@@ -1444,11 +1451,26 @@ nmi_swapgs:
nmi_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
- /* Pop the extra iret frame at once */
+
+ /* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8
- /* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
+ /*
+ * Clear "NMI executing". Set DF first so that we can easily
+ * distinguish the remaining code between here and IRET from
+ * the SYSCALL entry and exit paths. On a native kernel, we
+ * could just inspect RIP, but, on paravirt kernels,
+ * INTERRUPT_RETURN can translate into a jump into a
+ * hypercall page.
+ */
+ std
+ movq $0, 5*8(%rsp) /* clear "NMI executing" */
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
INTERRUPT_RETURN
END(nmi)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index bb187a6a877c..a9360d40fb7f 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -22,8 +22,8 @@
#define __AUDIT_ARCH_LE 0x40000000
#ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit ia32_ret_from_sys_call
-# define sysretl_audit ia32_ret_from_sys_call
+# define sysexit_audit ia32_ret_from_sys_call_irqs_off
+# define sysretl_audit ia32_ret_from_sys_call_irqs_off
#endif
.section .entry.text, "ax"
@@ -140,7 +140,9 @@ sysexit_from_sys_call:
*/
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
movl RIP(%rsp), %ecx /* User %eip */
- RESTORE_RSI_RDI
+ movq RAX(%rsp), %rax
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
xorl %edx, %edx /* Do not leak kernel information */
xorq %r8, %r8
xorq %r9, %r9
@@ -205,14 +207,13 @@ sysexit_from_sys_call:
movl RDX(%rsp), %edx /* arg3 */
movl RSI(%rsp), %ecx /* arg4 */
movl RDI(%rsp), %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz ia32_ret_from_sys_call
movl %eax, %esi /* second arg, syscall return value */
cmpl $-MAX_ERRNO, %eax /* is it an error ? */
jbe 1f
@@ -220,7 +221,6 @@ sysexit_from_sys_call:
1: setbe %al /* 1 if error, 0 if not */
movzbl %al, %edi /* zero-extend that into %edi */
call __audit_syscall_exit
- movq RAX(%rsp), %rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -231,11 +231,12 @@ sysexit_from_sys_call:
movq %rax, R10(%rsp)
movq %rax, R9(%rsp)
movq %rax, R8(%rsp)
- jmp int_with_check
+ jmp int_ret_from_sys_call_irqs_off
.endm
sysenter_auditsys:
auditsys_entry_common
+ movl %ebp, %r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
sysexit_audit:
@@ -336,7 +337,7 @@ ENTRY(entry_SYSCALL_compat)
* 32-bit zero extended:
*/
ASM_STAC
-1: movl (%r8), %ebp
+1: movl (%r8), %r9d
_ASM_EXTABLE(1b, ia32_badarg)
ASM_CLAC
orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
@@ -346,7 +347,7 @@ ENTRY(entry_SYSCALL_compat)
cstar_do_call:
/* 32-bit syscall -> 64-bit C ABI argument conversion */
movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
+ /* r9 already loaded */ /* arg6 */
xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
movl %ebx, %edi /* arg1 */
movl %edx, %edx /* arg3 (zero extension) */
@@ -358,7 +359,6 @@ cstar_dispatch:
call *ia32_sys_call_table(, %rax, 8)
movq %rax, RAX(%rsp)
1:
- movl RCX(%rsp), %ebp
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -366,9 +366,12 @@ cstar_dispatch:
sysretl_from_sys_call:
andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- RESTORE_RSI_RDI_RDX
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
movl RIP(%rsp), %ecx
movl EFLAGS(%rsp), %r11d
+ movq RAX(%rsp), %rax
xorq %r10, %r10
xorq %r9, %r9
xorq %r8, %r8
@@ -392,7 +395,9 @@ sysretl_from_sys_call:
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
+ movl %r9d, R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
+ movl R9(%rsp), %r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
@@ -404,14 +409,16 @@ cstar_tracesys:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jz cstar_auditsys
#endif
+ xchgl %r9d, %ebp
SAVE_EXTRA_REGS
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
+ movq %r9, R9(%rsp)
movq %rax, R8(%rsp)
movq %rsp, %rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
+ movl R9(%rsp), %r9d
/* Reload arg registers from stack. (see sysenter_tracesys) */
movl RCX(%rsp), %ecx
@@ -421,12 +428,53 @@ cstar_tracesys:
movl %eax, %eax /* zero extension */
RESTORE_EXTRA_REGS
+ xchgl %ebp, %r9d
jmp cstar_do_call
END(entry_SYSCALL_compat)
ia32_badarg:
- ASM_CLAC
- movq $-EFAULT, RAX(%rsp)
+ /*
+ * So far, we've entered kernel mode, set AC, turned on IRQs, and
+ * saved C regs except r8-r11. We haven't done any of the other
+ * standard entry work, though. We want to bail, but we shouldn't
+ * treat this as a syscall entry since we don't even know what the
+ * args are. Instead, treat this as a non-syscall entry, finish
+ * the entry work, and immediately exit after setting AX = -EFAULT.
+ *
+ * We're really just being polite here. Killing the task outright
+ * would be a reasonable action, too. Given that the only valid
+ * way to have gotten here is through the vDSO, and we already know
+ * that the stack pointer is bad, the task isn't going to survive
+ * for long no matter what we do.
+ */
+
+ ASM_CLAC /* undo STAC */
+ movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */
+
+ /* Fill in the rest of pt_regs */
+ xorl %eax, %eax
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ SAVE_EXTRA_REGS
+
+ /* Turn IRQs back off. */
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+
+ /* Now finish entering normal kernel mode. */
+#ifdef CONFIG_CONTEXT_TRACKING
+ call enter_from_user_mode
+#endif
+
+ /* And exit again. */
+ jmp retint_user
+
+ia32_ret_from_sys_call_irqs_off:
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
ia32_ret_from_sys_call:
xorl %eax, %eax /* Do not leak kernel information */
movq %rax, R11(%rsp)
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..7663c455b9f6 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -365,3 +365,20 @@
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
358 i386 execveat sys_execveat stub32_execveat
+359 i386 socket sys_socket
+360 i386 socketpair sys_socketpair
+361 i386 bind sys_bind
+362 i386 connect sys_connect
+363 i386 listen sys_listen
+364 i386 accept4 sys_accept4
+365 i386 getsockopt sys_getsockopt compat_sys_getsockopt
+366 i386 setsockopt sys_setsockopt compat_sys_setsockopt
+367 i386 getsockname sys_getsockname
+368 i386 getpeername sys_getpeername
+369 i386 sendto sys_sendto
+370 i386 sendmsg sys_sendmsg compat_sys_sendmsg
+371 i386 recvfrom sys_recvfrom compat_sys_recvfrom
+372 i386 recvmsg sys_recvmsg compat_sys_recvmsg
+373 i386 shutdown sys_shutdown
+374 i386 userfaultfd sys_userfaultfd
+375 i386 membarrier sys_membarrier
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..278842fdf1f6 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -329,6 +329,8 @@
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat stub_execveat
+323 common userfaultfd sys_userfaultfd
+324 common membarrier sys_membarrier
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e97032069f88..a3d0767a6b29 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -8,7 +8,7 @@ KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
VDSO32-$(CONFIG_X86_32) := y
-VDSO32-$(CONFIG_COMPAT) := y
+VDSO32-$(CONFIG_IA32_EMULATION) := y
# files to link into the vdso
vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@@ -20,7 +20,7 @@ obj-y += vma.o
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
vdso_img-$(VDSO32-y) += 32-int80
-vdso_img-$(CONFIG_COMPAT) += 32-syscall
+vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
vdso_img-$(VDSO32-y) += 32-sysenter
obj-$(VDSO32-y) += vdso32-setup.o
@@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
# Build multiple 32-bit vDSO images to choose from at boot time.
#
vdso32.so-$(VDSO32-y) += int80
-vdso32.so-$(CONFIG_COMPAT) += syscall
+vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
@@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO $@
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
$(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
GCOV_PROFILE := n
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322751e0..ca94fa649251 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
notrace static cycle_t vread_tsc(void)
{
- cycle_t ret;
- u64 last;
-
- /*
- * Empirically, a fence (of type that depends on the CPU)
- * before rdtsc is enough to ensure that rdtsc is ordered
- * with respect to loads. The various CPU manuals are unclear
- * as to whether rdtsc can be reordered with later loads,
- * but no one has ever seen it happen.
- */
- rdtsc_barrier();
- ret = (cycle_t)__native_read_tsc();
-
- last = gtod->cycle_last;
+ cycle_t ret = (cycle_t)rdtsc_ordered();
+ u64 last = gtod->cycle_last;
if (likely(ret >= last))
return ret;
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750c3859..434543145d78 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -177,7 +177,7 @@ up_fail:
return ret;
}
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
int ret;
@@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
return map_vdso(&vdso_image_x32, true);
}
#endif
-
+#ifdef CONFIG_IA32_EMULATION
return load_vdso32();
+#else
+ return 0;
+#endif
}
#endif
#else
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff6fdcc..b160c0c6baed 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -277,7 +277,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
{
return "[vsyscall]";
}
-static struct vm_operations_struct gate_vma_ops = {
+static const struct vm_operations_struct gate_vma_ops = {
.name = gate_vma_name,
};
static struct vm_area_struct gate_vma = {
@@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
-#ifdef CONFIG_IA32_EMULATION
+#ifdef CONFIG_COMPAT
if (!mm || mm->context.ia32_compat)
return NULL;
#endif
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index ae3a29ae875b..a0a19b7ba22d 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,99 +34,6 @@
#include <asm/sys_ia32.h>
#include <asm/smap.h>
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
- int err = 0;
- bool ia32 = test_thread_flag(TIF_IA32);
-
- if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- put_user_try {
- /* If you change siginfo_t structure, please make sure that
- this code is fixed accordingly.
- It should never copy any pad contained in the structure
- to avoid security leaks, but must copy the generic
- 3 ints plus the relevant union member. */
- put_user_ex(from->si_signo, &to->si_signo);
- put_user_ex(from->si_errno, &to->si_errno);
- put_user_ex((short)from->si_code, &to->si_code);
-
- if (from->si_code < 0) {
- put_user_ex(from->si_pid, &to->si_pid);
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
- } else {
- /*
- * First 32bits of unions are always present:
- * si_pid === si_band === si_tid === si_addr(LS half)
- */
- put_user_ex(from->_sifields._pad[0],
- &to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
- break;
- case __SI_SYS >> 16:
- put_user_ex(from->si_syscall, &to->si_syscall);
- put_user_ex(from->si_arch, &to->si_arch);
- break;
- case __SI_CHLD >> 16:
- if (ia32) {
- put_user_ex(from->si_utime, &to->si_utime);
- put_user_ex(from->si_stime, &to->si_stime);
- } else {
- put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
- put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
- }
- put_user_ex(from->si_status, &to->si_status);
- /* FALL THROUGH */
- default:
- case __SI_KILL >> 16:
- put_user_ex(from->si_uid, &to->si_uid);
- break;
- case __SI_POLL >> 16:
- put_user_ex(from->si_fd, &to->si_fd);
- break;
- case __SI_TIMER >> 16:
- put_user_ex(from->si_overrun, &to->si_overrun);
- put_user_ex(ptr_to_compat(from->si_ptr),
- &to->si_ptr);
- break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
- put_user_ex(from->si_uid, &to->si_uid);
- put_user_ex(from->si_int, &to->si_int);
- break;
- }
- }
- } put_user_catch(err);
-
- return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
- int err = 0;
- u32 ptr32;
-
- if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
- return -EFAULT;
-
- get_user_try {
- get_user_ex(to->si_signo, &from->si_signo);
- get_user_ex(to->si_errno, &from->si_errno);
- get_user_ex(to->si_code, &from->si_code);
-
- get_user_ex(to->si_pid, &from->si_pid);
- get_user_ex(to->si_uid, &from->si_uid);
- get_user_ex(ptr32, &from->si_ptr);
- to->si_ptr = compat_ptr(ptr32);
- } get_user_catch(err);
-
- return err;
-}
-
/*
* Do a signal return; undo the signal stack.
*/
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4dd1f2d770af..aeac434c9feb 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -9,3 +9,4 @@ generic-y += cputime.h
generic-y += dma-contiguous.h
generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += mm-arch-hooks.h
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index c8393634ca0c..ebf6d5e5668c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,6 @@ struct apic {
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
- bool wait_for_init_deassert;
void (*inquire_remote_apic)(int apicid);
/* apic ops */
@@ -378,7 +377,6 @@ extern struct apic *__apicdrivers[], *__apicdrivers_end[];
* APIC functionality to boot other CPUs - only used on SMP:
*/
#ifdef CONFIG_SMP
-extern atomic_t init_deasserted;
extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
#endif
diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h
index 9686c3d9ff73..259a7c1ef709 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -21,7 +21,7 @@
* ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
* compiler switches.
*/
-static inline unsigned int __arch_hweight32(unsigned int w)
+static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
unsigned int res = 0;
@@ -42,20 +42,23 @@ static inline unsigned int __arch_hweight8(unsigned int w)
return __arch_hweight32(w & 0xff);
}
+#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
- unsigned long res = 0;
-
-#ifdef CONFIG_X86_32
return __arch_hweight32((u32)w) +
__arch_hweight32((u32)(w >> 32));
+}
#else
+static __always_inline unsigned long __arch_hweight64(__u64 w)
+{
+ unsigned long res = 0;
+
asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
: "="REG_OUT (res)
: REG_IN (w));
-#endif /* CONFIG_X86_32 */
return res;
}
+#endif /* CONFIG_X86_32 */
#endif
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index e9168955c42f..fb52aa644aab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -182,6 +182,21 @@ static inline int atomic_xchg(atomic_t *v, int new)
return xchg(&v->counter, new);
}
+#define ATOMIC_OP(op) \
+static inline void atomic_##op(int i, atomic_t *v) \
+{ \
+ asm volatile(LOCK_PREFIX #op"l %1,%0" \
+ : "+m" (v->counter) \
+ : "ir" (i) \
+ : "memory"); \
+}
+
+ATOMIC_OP(and)
+ATOMIC_OP(or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_OP
+
/**
* __atomic_add_unless - add unless the number is already a given value
* @v: pointer of type atomic_t
@@ -219,16 +234,6 @@ static __always_inline short int atomic_inc_short(short int *v)
return *v;
}
-/* These are x86-specific, used by some header files */
-#define atomic_clear_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "andl %0,%1" \
- : : "r" (~(mask)), "m" (*(addr)) : "memory")
-
-#define atomic_set_mask(mask, addr) \
- asm volatile(LOCK_PREFIX "orl %0,%1" \
- : : "r" ((unsigned)(mask)), "m" (*(addr)) \
- : "memory")
-
#ifdef CONFIG_X86_32
# include <asm/atomic64_32.h>
#else
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index b154de75c90c..a11c30b77fb5 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -313,4 +313,18 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
#undef alternative_atomic64
#undef __alternative_atomic64
+#define ATOMIC64_OP(op, c_op) \
+static inline void atomic64_##op(long long i, atomic64_t *v) \
+{ \
+ long long old, c = 0; \
+ while ((old = atomic64_cmpxchg(v, c, c c_op i)) != c) \
+ c = old; \
+}
+
+ATOMIC64_OP(and, &)
+ATOMIC64_OP(or, |)
+ATOMIC64_OP(xor, ^)
+
+#undef ATOMIC64_OP
+
#endif /* _ASM_X86_ATOMIC64_32_H */
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index b965f9e03f2a..50e33eff58de 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -220,4 +220,19 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
return dec;
}
+#define ATOMIC64_OP(op) \
+static inline void atomic64_##op(long i, atomic64_t *v) \
+{ \
+ asm volatile(LOCK_PREFIX #op"q %1,%0" \
+ : "+m" (v->counter) \
+ : "er" (i) \
+ : "memory"); \
+}
+
+ATOMIC64_OP(and)
+ATOMIC64_OP(or)
+ATOMIC64_OP(xor)
+
+#undef ATOMIC64_OP
+
#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index e51a8f803f55..0681d2532527 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -57,12 +57,12 @@
do { \
compiletime_assert_atomic_type(*p); \
smp_mb(); \
- ACCESS_ONCE(*p) = (v); \
+ WRITE_ONCE(*p, v); \
} while (0)
#define smp_load_acquire(p) \
({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
smp_mb(); \
___p1; \
@@ -74,12 +74,12 @@ do { \
do { \
compiletime_assert_atomic_type(*p); \
barrier(); \
- ACCESS_ONCE(*p) = (v); \
+ WRITE_ONCE(*p, v); \
} while (0)
#define smp_load_acquire(p) \
({ \
- typeof(*p) ___p1 = ACCESS_ONCE(*p); \
+ typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
barrier(); \
___p1; \
@@ -91,15 +91,4 @@ do { \
#define smp_mb__before_atomic() barrier()
#define smp_mb__after_atomic() barrier()
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- */
-static __always_inline void rdtsc_barrier(void)
-{
- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
- "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 9bf3ea14b9f0..e63aa38e85fb 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages);
void clflush_cache_range(void *addr, unsigned int size);
+#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
+
#ifdef CONFIG_DEBUG_RODATA
void mark_rodata_ro(void);
extern const int rodata_test_data;
@@ -109,75 +111,4 @@ static inline int rodata_test(void)
}
#endif
-#ifdef ARCH_HAS_NOCACHE_UACCESS
-
-/**
- * arch_memcpy_to_pmem - copy data to persistent memory
- * @dst: destination buffer for the copy
- * @src: source buffer for the copy
- * @n: length of the copy in bytes
- *
- * Copy data to persistent memory media via non-temporal stores so that
- * a subsequent arch_wmb_pmem() can flush cpu and memory controller
- * write buffers to guarantee durability.
- */
-static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
- size_t n)
-{
- int unwritten;
-
- /*
- * We are copying between two kernel buffers, if
- * __copy_from_user_inatomic_nocache() returns an error (page
- * fault) we would have already reported a general protection fault
- * before the WARN+BUG.
- */
- unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
- (void __user *) src, n);
- if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
- __func__, dst, src, unwritten))
- BUG();
-}
-
-/**
- * arch_wmb_pmem - synchronize writes to persistent memory
- *
- * After a series of arch_memcpy_to_pmem() operations this drains data
- * from cpu write buffers and any platform (memory controller) buffers
- * to ensure that written data is durable on persistent memory media.
- */
-static inline void arch_wmb_pmem(void)
-{
- /*
- * wmb() to 'sfence' all previous writes such that they are
- * architecturally visible to 'pcommit'. Note, that we've
- * already arranged for pmem writes to avoid the cache via
- * arch_memcpy_to_pmem().
- */
- wmb();
- pcommit_sfence();
-}
-
-static inline bool __arch_has_wmb_pmem(void)
-{
-#ifdef CONFIG_X86_64
- /*
- * We require that wmb() be an 'sfence', that is only guaranteed on
- * 64-bit builds
- */
- return static_cpu_has(X86_FEATURE_PCOMMIT);
-#else
- return false;
-#endif
-}
-#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */
-extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n);
-extern void arch_wmb_pmem(void);
-
-static inline bool __arch_has_wmb_pmem(void)
-{
- return false;
-}
-#endif
-
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h
deleted file mode 100644
index 1fe49704b146..000000000000
--- a/arch/x86/include/asm/context_tracking.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_CONTEXT_TRACKING_H
-#define _ASM_X86_CONTEXT_TRACKING_H
-
-#ifdef CONFIG_CONTEXT_TRACKING
-# define SCHEDULE_USER call schedule_user
-#else
-# define SCHEDULE_USER call schedule
-#endif
-
-#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606fb97d0..9727b3b48bd1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -119,6 +119,7 @@
#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */
#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */
#define X86_FEATURE_CID ( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */
#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */
#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */
#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */
@@ -176,6 +177,7 @@
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
/*
* Auxiliary flags: Linux defined - For features scattered in various
@@ -191,7 +193,7 @@
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_HWP ( 7*32+ 10) /* "hwp" Intel HWP */
-#define X86_FEATURE_HWP_NOITFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
+#define X86_FEATURE_HWP_NOTIFY ( 7*32+ 11) /* Intel HWP_NOTIFY */
#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
@@ -239,6 +241,7 @@
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2754c7..36a760bda462 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
#include <asm-generic/delay.h>
void use_tsc_delay(void);
+void use_mwaitx_delay(void);
#endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a0bf89fd2647..4e10d73cf018 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -280,21 +280,6 @@ static inline void clear_LDT(void)
set_ldt(NULL, 0);
}
-/*
- * load one particular LDT into the current CPU
- */
-static inline void load_LDT_nolock(mm_context_t *pc)
-{
- set_ldt(pc->ldt, pc->size);
-}
-
-static inline void load_LDT(mm_context_t *pc)
-{
- preempt_disable();
- load_LDT_nolock(pc);
- preempt_enable();
-}
-
static inline unsigned long get_desc_base(const struct desc_struct *desc)
{
return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 1f5b7287d1ad..953b7263f844 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -12,7 +12,6 @@
#include <linux/dma-attrs.h>
#include <asm/io.h>
#include <asm/swiotlb.h>
-#include <asm-generic/dma-coherent.h>
#include <linux/dma-contiguous.h>
#ifdef CONFIG_ISA
@@ -41,24 +40,13 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
#endif
}
-#include <asm-generic/dma-mapping-common.h>
-
-/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
- debug_dma_mapping_error(dev, dma_addr);
- if (ops->mapping_error)
- return ops->mapping_error(dev, dma_addr);
-
- return (dma_addr == DMA_ERROR_CODE);
-}
-
-#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
-#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
+#define arch_dma_alloc_attrs arch_dma_alloc_attrs
+#define HAVE_ARCH_DMA_SUPPORTED 1
extern int dma_supported(struct device *hwdev, u64 mask);
-extern int dma_set_mask(struct device *dev, u64 mask);
+
+#include <asm-generic/dma-mapping-common.h>
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
@@ -125,16 +113,4 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
return gfp;
}
-#define dma_alloc_coherent(d,s,h,f) dma_alloc_attrs(d,s,h,f,NULL)
-
-void *
-dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, struct dma_attrs *attrs);
-
-#define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
-
-void dma_free_attrs(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus,
- struct dma_attrs *attrs);
-
#endif
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 155162ea0e00..ae68be92f755 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -86,6 +86,18 @@ extern u64 asmlinkage efi_call(void *fp, ...);
extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
u32 type, u64 attribute);
+#ifdef CONFIG_KASAN
+/*
+ * CONFIG_KASAN may redefine memset to __memset. __memset function is present
+ * only in kernel binary. Since the EFI stub linked into a separate binary it
+ * doesn't have __memset(). So we should use standard memset from
+ * arch/x86/boot/compressed/string.c. The same applies to memcpy and memmove.
+ */
+#undef memcpy
+#undef memset
+#undef memmove
+#endif
+
#endif /* CONFIG_X86_32 */
extern struct efi_scratch efi_scratch;
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f161c189c27b..141c561f4664 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
#ifdef CONFIG_X86_64
extern unsigned int vdso64_enabled;
#endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
extern unsigned int vdso32_enabled;
#endif
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
elf_common_init(&current->thread, regs, __USER_DS)
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread start_thread_ia32
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
void set_personality_ia32(bool);
#define COMPAT_SET_PERSONALITY(ex) \
@@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
*/
static inline int mmap_is_ia32(void)
{
-#ifdef CONFIG_X86_32
- return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_ADDR32))
- return 1;
-#endif
- return 0;
+ return config_enabled(CONFIG_X86_32) ||
+ (config_enabled(CONFIG_COMPAT) &&
+ test_thread_flag(TIF_ADDR32));
}
/* Do not change the values. See get_align_mask() */
diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h
index 99efebb2f69d..ca3ce9ab9385 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void);
-extern void init_espfix_ap(void);
+extern void init_espfix_ap(int cpu);
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 0637826292de..c49c5173158e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -189,6 +189,7 @@ union fpregs_state {
struct fxregs_state fxsave;
struct swregs_state soft;
struct xregs_state xsave;
+ u8 __padding[PAGE_SIZE];
};
/*
@@ -198,40 +199,6 @@ union fpregs_state {
*/
struct fpu {
/*
- * @state:
- *
- * In-memory copy of all FPU registers that we save/restore
- * over context switches. If the task is using the FPU then
- * the registers in the FPU are more recent than this state
- * copy. If the task context-switches away then they get
- * saved here and represent the FPU state.
- *
- * After context switches there may be a (short) time period
- * during which the in-FPU hardware registers are unchanged
- * and still perfectly match this state, if the tasks
- * scheduled afterwards are not using the FPU.
- *
- * This is the 'lazy restore' window of optimization, which
- * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
- *
- * We detect whether a subsequent task uses the FPU via setting
- * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
- *
- * During this window, if the task gets scheduled again, we
- * might be able to skip having to do a restore from this
- * memory buffer to the hardware registers - at the cost of
- * incurring the overhead of #NM fault traps.
- *
- * Note that on modern CPUs that support the XSAVEOPT (or other
- * optimized XSAVE instructions), we don't use #NM traps anymore,
- * as the hardware can track whether FPU registers need saving
- * or not. On such CPUs we activate the non-lazy ('eagerfpu')
- * logic, which unconditionally saves/restores all FPU state
- * across context switches. (if FPU state exists.)
- */
- union fpregs_state state;
-
- /*
* @last_cpu:
*
* Records the last CPU on which this context was loaded into
@@ -288,6 +255,43 @@ struct fpu {
* deal with bursty apps that only use the FPU for a short time:
*/
unsigned char counter;
+ /*
+ * @state:
+ *
+ * In-memory copy of all FPU registers that we save/restore
+ * over context switches. If the task is using the FPU then
+ * the registers in the FPU are more recent than this state
+ * copy. If the task context-switches away then they get
+ * saved here and represent the FPU state.
+ *
+ * After context switches there may be a (short) time period
+ * during which the in-FPU hardware registers are unchanged
+ * and still perfectly match this state, if the tasks
+ * scheduled afterwards are not using the FPU.
+ *
+ * This is the 'lazy restore' window of optimization, which
+ * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+ *
+ * We detect whether a subsequent task uses the FPU via setting
+ * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+ *
+ * During this window, if the task gets scheduled again, we
+ * might be able to skip having to do a restore from this
+ * memory buffer to the hardware registers - at the cost of
+ * incurring the overhead of #NM fault traps.
+ *
+ * Note that on modern CPUs that support the XSAVEOPT (or other
+ * optimized XSAVE instructions), we don't use #NM traps anymore,
+ * as the hardware can track whether FPU registers need saving
+ * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+ * logic, which unconditionally saves/restores all FPU state
+ * across context switches. (if FPU state exists.)
+ */
+ union fpregs_state state;
+ /*
+ * WARNING: 'state' is dynamically-sized. Do not put
+ * anything after it here.
+ */
};
#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index f45acad3c4b6..24938852db30 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -3,9 +3,9 @@
#ifdef CONFIG_FUNCTION_TRACER
#ifdef CC_USING_FENTRY
-# define MCOUNT_ADDR ((long)(__fentry__))
+# define MCOUNT_ADDR ((unsigned long)(__fentry__))
#else
-# define MCOUNT_ADDR ((long)(mcount))
+# define MCOUNT_ADDR ((unsigned long)(mcount))
#endif
#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 6615032e19c8..1e3408e88604 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -182,10 +182,10 @@ extern char irq_entries_start[];
#define trace_irq_entries_start irq_entries_start
#endif
-#define VECTOR_UNDEFINED (-1)
-#define VECTOR_RETRIGGERED (-2)
+#define VECTOR_UNUSED NULL
+#define VECTOR_RETRIGGERED ((void *)~0UL)
-typedef int vector_irq_t[NR_VECTORS];
+typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index d0e8e0141041..28019765442e 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -22,15 +22,6 @@ struct ucontext_ia32 {
compat_sigset_t uc_sigmask; /* mask last for extensibility */
};
-struct ucontext_x32 {
- unsigned int uc_flags;
- unsigned int uc_link;
- compat_stack_t uc_stack;
- unsigned int uc__pad0; /* needed for alignment */
- struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
- compat_sigset_t uc_sigmask; /* mask last for extensibility */
-};
-
/* This matches struct stat64 in glibc2.2, hence the absolutely
* insane amounts of padding around dev_t's.
*/
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
index 200ec2e7821d..cd0310e186f4 100644
--- a/arch/x86/include/asm/intel_pmc_ipc.h
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -25,36 +25,9 @@
#if IS_ENABLED(CONFIG_INTEL_PMC_IPC)
-/*
- * intel_pmc_ipc_simple_command
- * @cmd: command
- * @sub: sub type
- */
int intel_pmc_ipc_simple_command(int cmd, int sub);
-
-/*
- * intel_pmc_ipc_raw_cmd
- * @cmd: command
- * @sub: sub type
- * @in: input data
- * @inlen: input length in bytes
- * @out: output data
- * @outlen: output length in dwords
- * @sptr: data writing to SPTR register
- * @dptr: data writing to DPTR register
- */
int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen, u32 dptr, u32 sptr);
-
-/*
- * intel_pmc_ipc_command
- * @cmd: command
- * @sub: sub type
- * @in: input data
- * @inlen: input length in bytes
- * @out: output data
- * @outlen: output length in dwords
- */
int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen);
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index cc9c61bc1abe..de25aad07853 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -180,6 +180,8 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
+#define ioremap_uc ioremap_uc
+
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
unsigned long prot_val);
@@ -248,12 +250,6 @@ static inline void flush_write_buffers(void)
#endif
}
-static inline void __pmem *arch_memremap_pmem(resource_size_t offset,
- unsigned long size)
-{
- return (void __force __pmem *) ioremap_cache(offset, size);
-}
-
#endif /* __KERNEL__ */
extern void native_io_delay(void);
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index 57995f0596a6..b72ad0faa6c5 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -52,20 +52,20 @@
/* Quark available units */
#define QRK_MBI_UNIT_HBA 0x00
-#define QRK_MBI_UNIT_HB 0x03
+#define QRK_MBI_UNIT_HB 0x03
#define QRK_MBI_UNIT_RMU 0x04
-#define QRK_MBI_UNIT_MM 0x05
+#define QRK_MBI_UNIT_MM 0x05
#define QRK_MBI_UNIT_MMESRAM 0x05
#define QRK_MBI_UNIT_SOC 0x31
/* Quark read/write opcodes */
#define QRK_MBI_HBA_READ 0x10
#define QRK_MBI_HBA_WRITE 0x11
-#define QRK_MBI_HB_READ 0x10
+#define QRK_MBI_HB_READ 0x10
#define QRK_MBI_HB_WRITE 0x11
#define QRK_MBI_RMU_READ 0x10
#define QRK_MBI_RMU_WRITE 0x11
-#define QRK_MBI_MM_READ 0x10
+#define QRK_MBI_MM_READ 0x10
#define QRK_MBI_MM_WRITE 0x11
#define QRK_MBI_MMESRAM_READ 0x12
#define QRK_MBI_MMESRAM_WRITE 0x13
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 8008d06581c7..881b4768644a 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,7 +36,9 @@ extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
extern void (*x86_platform_ipi_callback)(void);
extern void native_init_IRQ(void);
-extern bool handle_irq(unsigned irq, struct pt_regs *regs);
+
+struct irq_desc;
+extern bool handle_irq(struct irq_desc *desc, struct pt_regs *regs);
extern __visible unsigned int do_IRQ(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4c2d2eb2060a..6ca9fd6234e1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,16 +117,6 @@
#define FPU_IRQ 13
-#define FIRST_VM86_IRQ 3
-#define LAST_VM86_IRQ 15
-
-#ifndef __ASSEMBLY__
-static inline int invalid_vm86_irq(int irq)
-{
- return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
-}
-#endif
-
/*
* Size the maximum number of interrupts.
*
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index a4c1cf7e93f8..5daeca3d0f9e 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -16,15 +16,32 @@
# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
#endif
-static __always_inline bool arch_static_branch(struct static_key *key)
+static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
- _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
".popsection \n\t"
- : : "i" (key) : : l_yes);
+ : : "i" (key), "i" (branch) : : l_yes);
+
+ return false;
+l_yes:
+ return true;
+}
+
+static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+{
+ asm_volatile_goto("1:"
+ ".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
+ "2:\n\t"
+ ".pushsection __jump_table, \"aw\" \n\t"
+ _ASM_ALIGN "\n\t"
+ _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t"
+ ".popsection \n\t"
+ : : "i" (key), "i" (branch) : : l_yes);
+
return false;
l_yes:
return true;
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 8b22422fbad8..1410b567ecde 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -1,6 +1,9 @@
#ifndef _ASM_X86_KASAN_H
#define _ASM_X86_KASAN_H
+#include <linux/const.h>
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
/*
* Compiler uses shadow offset assuming that addresses start
* from 0. Kernel addresses don't start from 0, so shadow
@@ -14,15 +17,11 @@
#ifndef __ASSEMBLY__
-extern pte_t kasan_zero_pte[];
-extern pte_t kasan_zero_pmd[];
-extern pte_t kasan_zero_pud[];
-
#ifdef CONFIG_KASAN
-void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_early_init(void);
void __init kasan_init(void);
#else
-static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
#endif
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 32ce71375b21..b130d59406fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -29,7 +29,7 @@ extern void show_trace(struct task_struct *t, struct pt_regs *regs,
extern void __show_regs(struct pt_regs *regs, int all);
extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr);
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
extern int in_crash_kexec;
#else
/* no crash dump is ever in progress if no crash kernel can be kexec'd */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2a7f5d782c33..2beee0382088 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,6 +40,7 @@
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_HALT_POLL_NS_DEFAULT 500000
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
@@ -252,6 +253,11 @@ struct kvm_pio_request {
int size;
};
+struct rsvd_bits_validate {
+ u64 rsvd_bits_mask[2][4];
+ u64 bad_mt_xwr;
+};
+
/*
* x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
* 32-bit). The kvm_mmu structure abstracts the details of the current mmu
@@ -289,8 +295,15 @@ struct kvm_mmu {
u64 *pae_root;
u64 *lm_root;
- u64 rsvd_bits_mask[2][4];
- u64 bad_mt_xwr;
+
+ /*
+ * check zero bits on shadow page table entries, these
+ * bits include not only hardware reserved bits but also
+ * the bits spte never used.
+ */
+ struct rsvd_bits_validate shadow_zero_check;
+
+ struct rsvd_bits_validate guest_rsvd_check;
/*
* Bitmap: bit set = last pte in walk
@@ -358,6 +371,11 @@ struct kvm_mtrr {
struct list_head head;
};
+/* Hyper-V per vcpu emulation context */
+struct kvm_vcpu_hv {
+ u64 hv_vapic;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -514,8 +532,7 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
- /* fields used by HYPER-V emulation */
- u64 hv_vapic;
+ struct kvm_vcpu_hv hyperv;
cpumask_var_t wbinvd_dirty_mask;
@@ -586,6 +603,17 @@ struct kvm_apic_map {
struct kvm_lapic *logical_map[16][16];
};
+/* Hyper-V emulation context */
+struct kvm_hv {
+ u64 hv_guest_os_id;
+ u64 hv_hypercall;
+ u64 hv_tsc_page;
+
+ /* Hyper-v based guest crash (NT kernel bugcheck) parameters */
+ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
+ u64 hv_crash_ctl;
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -604,6 +632,8 @@ struct kvm_arch {
bool iommu_noncoherent;
#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
atomic_t noncoherent_dma_count;
+#define __KVM_HAVE_ARCH_ASSIGNED_DEVICE
+ atomic_t assigned_device_count;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
@@ -643,16 +673,14 @@ struct kvm_arch {
/* reads protected by irq_srcu, writes by irq_lock */
struct hlist_head mask_notifier_list;
- /* fields used by HYPER-V emulation */
- u64 hv_guest_os_id;
- u64 hv_hypercall;
- u64 hv_tsc_page;
+ struct kvm_hv hyperv;
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
bool boot_vcpu_runs_old_kvmclock;
+ u32 bsp_vcpu_id;
u64 disabled_quirks;
};
@@ -684,6 +712,7 @@ struct kvm_vcpu_stat {
u32 nmi_window_exits;
u32 halt_exits;
u32 halt_successful_poll;
+ u32 halt_attempted_poll;
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
@@ -1201,5 +1230,7 @@ int __x86_set_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem);
int x86_set_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem);
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f6266f425..0d9b14f60d2c 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -2,7 +2,6 @@
#define _ASM_X86_MATH_EMU_H
#include <asm/ptrace.h>
-#include <asm/vm86.h>
/* This structure matches the layout of the data saved to the stack
following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
*/
struct math_emu_info {
long ___orig_eip;
- union {
- struct pt_regs *regs;
- struct kernel_vm86_regs *vm86;
- };
+ struct pt_regs *regs;
};
#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 982dfc3679ad..2dbc0bf2b9f3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -151,10 +151,12 @@ extern int mce_p5_enabled;
#ifdef CONFIG_X86_MCE
int mcheck_init(void);
void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_cpu_clear(struct cpuinfo_x86 *c);
void mcheck_vendor_init_severity(void);
#else
static inline int mcheck_init(void) { return 0; }
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_cpu_clear(struct cpuinfo_x86 *c) {}
static inline void mcheck_vendor_init_severity(void) {}
#endif
@@ -181,20 +183,18 @@ DECLARE_PER_CPU(struct device *, mce_device);
#ifdef CONFIG_X86_MCE_INTEL
void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void mce_intel_feature_clear(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
-void lmce_clear(void);
-void lmce_enable(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void mce_intel_feature_clear(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
-static inline void lmce_clear(void) {}
-static inline void lmce_enable(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
diff --git a/arch/x86/include/asm/mm-arch-hooks.h b/arch/x86/include/asm/mm-arch-hooks.h
deleted file mode 100644
index 4e881a342236..000000000000
--- a/arch/x86/include/asm/mm-arch-hooks.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Architecture specific mm hooks
- *
- * Copyright (C) 2015, IBM Corporation
- * Author: Laurent Dufour <ldufour@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _ASM_X86_MM_ARCH_HOOKS_H
-#define _ASM_X86_MM_ARCH_HOOKS_H
-
-#endif /* _ASM_X86_MM_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 09b9620a73b4..55234d5e7160 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,8 +9,9 @@
* we put the segment information here.
*/
typedef struct {
- void *ldt;
- int size;
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+#endif
#ifdef CONFIG_X86_64
/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5e8daee7c5c9..379cd3658799 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -23,7 +23,7 @@ extern struct static_key rdpmc_always_available;
static inline void load_mm_cr4(struct mm_struct *mm)
{
- if (static_key_true(&rdpmc_always_available) ||
+ if (static_key_false(&rdpmc_always_available) ||
atomic_read(&mm->context.perf_rdpmc_allowed))
cr4_set_bits(X86_CR4_PCE);
else
@@ -33,12 +33,68 @@ static inline void load_mm_cr4(struct mm_struct *mm)
static inline void load_mm_cr4(struct mm_struct *mm) {}
#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+/*
+ * ldt_structs can be allocated, used, and freed, but they are never
+ * modified while live.
+ */
+struct ldt_struct {
+ /*
+ * Xen requires page-aligned LDTs with special permissions. This is
+ * needed to prevent us from installing evil descriptors such as
+ * call gates. On native, we could merge the ldt_struct and LDT
+ * allocations, but it's not worth trying to optimize.
+ */
+ struct desc_struct *entries;
+ int size;
+};
+
/*
* Used for LDT copy/destruction.
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
void destroy_context(struct mm_struct *mm);
+#else /* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
+ /* lockless_dereference synchronizes with smp_store_release */
+ ldt = lockless_dereference(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt))
+ set_ldt(ldt->entries, ldt->size);
+ else
+ clear_LDT();
+#else
+ clear_LDT();
+#endif
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
@@ -70,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Load per-mm CR4 state */
load_mm_cr4(next);
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT, if the LDT is different.
*
@@ -78,12 +135,13 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* was called and then modify_ldt changed
* prev->context.ldt but suppressed an IPI to this CPU.
* In this case, prev->context.ldt != NULL, because we
- * never free an LDT while the mm still exists. That
- * means that next->context.ldt != prev->context.ldt,
- * because mms never share an LDT.
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
*/
if (unlikely(prev->context.ldt != next->context.ldt))
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
+#endif
}
#ifdef CONFIG_SMP
else {
@@ -106,7 +164,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
- load_LDT_nolock(&next->context);
+ load_mm_ldt(next);
}
}
#endif
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index c163215abb9a..aaf59b7da98a 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -7,6 +7,7 @@
struct ms_hyperv_info {
u32 features;
+ u32 misc_features;
u32 hints;
};
@@ -20,4 +21,8 @@ void hyperv_vector_handler(struct pt_regs *regs);
void hv_setup_vmbus_irq(void (*handler)(void));
void hv_remove_vmbus_irq(void);
+void hv_setup_kexec_handler(void (*handler)(void));
+void hv_remove_kexec_handler(void);
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs));
+void hv_remove_crash_handler(void);
#endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 9ebc3d009373..b8c14bb7fc8f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -73,6 +73,12 @@
#define MSR_LBR_CORE_FROM 0x00000040
#define MSR_LBR_CORE_TO 0x00000060
+#define MSR_LBR_INFO_0 0x00000dc0 /* ... 0xddf for _31 */
+#define LBR_INFO_MISPRED BIT_ULL(63)
+#define LBR_INFO_IN_TX BIT_ULL(62)
+#define LBR_INFO_ABORT BIT_ULL(61)
+#define LBR_INFO_CYCLES 0xffff
+
#define MSR_IA32_PEBS_ENABLE 0x000003f1
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
@@ -80,13 +86,21 @@
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_CYCLEACC BIT(1)
#define RTIT_CTL_OS BIT(2)
#define RTIT_CTL_USR BIT(3)
#define RTIT_CTL_CR3EN BIT(7)
#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_MTC_EN BIT(9)
#define RTIT_CTL_TSC_EN BIT(10)
#define RTIT_CTL_DISRETC BIT(11)
#define RTIT_CTL_BRANCH_EN BIT(13)
+#define RTIT_CTL_MTC_RANGE_OFFSET 14
+#define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET)
+#define RTIT_CTL_CYC_THRESH_OFFSET 19
+#define RTIT_CTL_CYC_THRESH (0x0full << RTIT_CTL_CYC_THRESH_OFFSET)
+#define RTIT_CTL_PSB_FREQ_OFFSET 24
+#define RTIT_CTL_PSB_FREQ (0x0full << RTIT_CTL_PSB_FREQ_OFFSET)
#define MSR_IA32_RTIT_STATUS 0x00000571
#define RTIT_STATUS_CONTEXTEN BIT(1)
#define RTIT_STATUS_TRIGGEREN BIT(2)
@@ -127,6 +141,8 @@
#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10)
#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11)
+#define MSR_PEBS_FRONTEND 0x000003f7
+
#define MSR_IA32_POWER_CTL 0x000001fc
#define MSR_IA32_MC0_CTL 0x00000400
@@ -170,6 +186,12 @@
#define MSR_PP1_ENERGY_STATUS 0x00000641
#define MSR_PP1_POLICY 0x00000642
+#define MSR_CONFIG_TDP_NOMINAL 0x00000648
+#define MSR_CONFIG_TDP_LEVEL_1 0x00000649
+#define MSR_CONFIG_TDP_LEVEL_2 0x0000064A
+#define MSR_CONFIG_TDP_CONTROL 0x0000064B
+#define MSR_TURBO_ACTIVATION_RATIO 0x0000064C
+
#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
@@ -311,6 +333,7 @@
/* C1E active bits in int pending message */
#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
#define MSR_K8_TSEG_ADDR 0xc0010112
+#define MSR_K8_TSEG_MASK 0xc0010113
#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e6a707eb5081..77d8b284e4a7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
* it means rax *or* rdx.
*/
#ifdef CONFIG_X86_64
-#define DECLARE_ARGS(val, low, high) unsigned low, high
-#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
-#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
+/* Using 64-bit values saves one instruction clearing the high half of low */
+#define DECLARE_ARGS(val, low, high) unsigned long low, high
+#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32)
#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
#else
#define DECLARE_ARGS(val, low, high) unsigned long long val
#define EAX_EDX_VAL(val, low, high) (val)
-#define EAX_EDX_ARGS(val, low, high) "A" (val)
#define EAX_EDX_RET(val, low, high) "=A" (val)
#endif
@@ -106,12 +105,19 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
return err;
}
-extern unsigned long long native_read_tsc(void);
-
extern int rdmsr_safe_regs(u32 regs[8]);
extern int wrmsr_safe_regs(u32 regs[8]);
-static __always_inline unsigned long long __native_read_tsc(void)
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer. The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect. The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
{
DECLARE_ARGS(val, low, high);
@@ -120,6 +126,35 @@ static __always_inline unsigned long long __native_read_tsc(void)
return EAX_EDX_VAL(val, low, high);
}
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter. It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+ /*
+ * The RDTSC instruction is not ordered relative to memory
+ * access. The Intel SDM and the AMD APM are both vague on this
+ * point, but empirically an RDTSC instruction can be
+ * speculatively executed before prior loads. An RDTSC
+ * immediately after an appropriate barrier appears to be
+ * ordered as a normal load, that is, it provides the same
+ * ordering guarantees as reading from a global memory location
+ * that some other imaginary CPU is updating continuously with a
+ * time stamp.
+ */
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
+ return rdtsc();
+}
+
+/* Deprecated, keep it for a cycle for easier merging: */
+#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0)
+
static inline unsigned long long native_read_pmc(int counter)
{
DECLARE_ARGS(val, low, high);
@@ -153,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
#define rdmsrl(msr, val) \
((val) = native_read_msr((msr)))
-#define wrmsrl(msr, val) \
- native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+ native_write_msr(msr, (u32)val, (u32)(val >> 32));
+}
/* wrmsr with exception handling */
static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
@@ -180,12 +217,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
return err;
}
-#define rdtscl(low) \
- ((low) = (u32)__native_read_tsc())
-
-#define rdtscll(val) \
- ((val) = __native_read_tsc())
-
#define rdpmc(counter, low, high) \
do { \
u64 _l = native_read_pmc((counter)); \
@@ -195,15 +226,6 @@ do { \
#define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
-#define rdtscp(low, high, aux) \
-do { \
- unsigned long long _val = native_read_tscp(&(aux)); \
- (low) = (u32)_val; \
- (high) = (u32)(_val >> 32); \
-} while (0)
-
-#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
-
#endif /* !CONFIG_PARAVIRT */
/*
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 653dfa7662e1..c70689b5e5aa 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,9 @@
#define CPUID5_ECX_INTERRUPT_BREAK 0x2
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+#define MWAITX_ECX_TIMER_ENABLE BIT(1)
+#define MWAITX_MAX_LOOPS ((u32)-1)
+#define MWAITX_DISABLE_CSTATES 0xf
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
@@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx,
:: "a" (eax), "c" (ecx), "d"(edx));
}
+static inline void __monitorx(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+ /* "monitorx %eax, %ecx, %edx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xfa;"
+ :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
/* "mwait %eax, %ecx;" */
@@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
:: "a" (eax), "c" (ecx));
}
+/*
+ * MWAITX allows for a timer expiration to get the core out a wait state in
+ * addition to the default MWAIT exit condition of a store appearing at a
+ * monitored virtual address.
+ *
+ * Registers:
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
+ * frequency is the same as the TSC frequency.
+ *
+ * Below is a comparison between MWAIT and MWAITX on AMD processors:
+ *
+ * MWAIT MWAITX
+ * opcode 0f 01 c9 | 0f 01 fb
+ * ECX[0] value of RFLAGS.IF seen by instruction
+ * ECX[1] unused/#GP if set | enable timer if set
+ * ECX[31:2] unused/#GP if set
+ * EAX unused (reserve for hint)
+ * EBX[31:0] unused | max wait time (P0 clocks)
+ *
+ * MONITOR MONITORX
+ * opcode 0f 01 c8 | 0f 01 fa
+ * EAX (logical) address to monitor
+ * ECX #GP if not zero
+ */
+static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+ unsigned long ecx)
+{
+ /* "mwaitx %eax, %ebx, %ecx;" */
+ asm volatile(".byte 0x0f, 0x01, 0xfb;"
+ :: "a" (eax), "b" (ebx), "c" (ecx));
+}
+
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
trace_hardirqs_on();
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d143bfad45d7..10d0596433f8 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -153,7 +153,11 @@ do { \
val = paravirt_read_msr(msr, &_err); \
} while (0)
-#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+ wrmsr(msr, (u32)val, (u32)(val>>32));
+}
+
#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
/* rdmsr with exception handling */
@@ -174,19 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
return err;
}
-static inline u64 paravirt_read_tsc(void)
-{
- return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
-}
-
-#define rdtscl(low) \
-do { \
- u64 _l = paravirt_read_tsc(); \
- low = (int)_l; \
-} while (0)
-
-#define rdtscll(val) (val = paravirt_read_tsc())
-
static inline unsigned long long paravirt_sched_clock(void)
{
return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +206,6 @@ do { \
#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
-{
- return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
-}
-
-#define rdtscp(low, high, aux) \
-do { \
- int __aux; \
- unsigned long __val = paravirt_rdtscp(&__aux); \
- (low) = (u32)__val; \
- (high) = (u32)(__val >> 32); \
- (aux) = __aux; \
-} while (0)
-
-#define rdtscpll(val, aux) \
-do { \
- unsigned long __aux; \
- val = paravirt_rdtscp(&__aux); \
- (aux) = __aux; \
-} while (0)
-
static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index a6b8f9fadb06..31247b5bff7c 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -97,7 +97,6 @@ struct pv_lazy_ops {
struct pv_time_ops {
unsigned long long (*sched_clock)(void);
unsigned long long (*steal_clock)(int cpu);
- unsigned long (*get_tsc_khz)(void);
};
struct pv_cpu_ops {
@@ -156,9 +155,7 @@ struct pv_cpu_ops {
u64 (*read_msr)(unsigned int msr, int *err);
int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
- u64 (*read_tsc)(void);
u64 (*read_pmc)(int counter);
- unsigned long long (*read_tscp)(unsigned int *aux);
#ifdef CONFIG_X86_32
/*
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 164e3f8d3c3d..fa1195dae425 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
extern void (*pcibios_disable_irq)(struct pci_dev *dev);
-extern bool mp_should_keep_irq(struct device *dev);
-
struct pci_raw_ops {
int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index dc0f6ed35b08..7bcb861a04e5 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -159,6 +159,13 @@ struct x86_pmu_capability {
*/
#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16)
+#define GLOBAL_STATUS_COND_CHG BIT_ULL(63)
+#define GLOBAL_STATUS_BUFFER_OVF BIT_ULL(62)
+#define GLOBAL_STATUS_UNC_OVF BIT_ULL(61)
+#define GLOBAL_STATUS_ASIF BIT_ULL(60)
+#define GLOBAL_STATUS_COUNTERS_FROZEN BIT_ULL(59)
+#define GLOBAL_STATUS_LBRS_FROZEN BIT_ULL(58)
+
/*
* IBS cpuid feature detection
*/
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index bc0fc0866553..aa8744c77c6d 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -18,6 +18,8 @@
/* ValleyView Power Control Unit PCI Device ID */
#define PCI_DEVICE_ID_VLV_PMC 0x0F1C
+/* CherryTrail Power Control Unit PCI Device ID */
+#define PCI_DEVICE_ID_CHT_PMC 0x229C
/* PMC Memory mapped IO registers */
#define PMC_BASE_ADDR_OFFSET 0x44
@@ -29,6 +31,10 @@
#define PMC_FUNC_DIS 0x34
#define PMC_FUNC_DIS_2 0x38
+/* CHT specific bits in FUNC_DIS2 register */
+#define BIT_FD_GMM BIT(3)
+#define BIT_FD_ISH BIT(4)
+
/* S0ix wake event control */
#define PMC_S0IX_WAKE_EN 0x3C
@@ -75,6 +81,21 @@
#define PMC_PSS_BIT_USB BIT(16)
#define PMC_PSS_BIT_USB_SUS BIT(17)
+/* CHT specific bits in PSS register */
+#define PMC_PSS_BIT_CHT_UFS BIT(7)
+#define PMC_PSS_BIT_CHT_UXD BIT(11)
+#define PMC_PSS_BIT_CHT_UXD_FD BIT(12)
+#define PMC_PSS_BIT_CHT_UX_ENG BIT(15)
+#define PMC_PSS_BIT_CHT_USB_SUS BIT(16)
+#define PMC_PSS_BIT_CHT_GMM BIT(17)
+#define PMC_PSS_BIT_CHT_ISH BIT(18)
+#define PMC_PSS_BIT_CHT_DFX_MASTER BIT(26)
+#define PMC_PSS_BIT_CHT_DFX_CLUSTER1 BIT(27)
+#define PMC_PSS_BIT_CHT_DFX_CLUSTER2 BIT(28)
+#define PMC_PSS_BIT_CHT_DFX_CLUSTER3 BIT(29)
+#define PMC_PSS_BIT_CHT_DFX_CLUSTER4 BIT(30)
+#define PMC_PSS_BIT_CHT_DFX_CLUSTER5 BIT(31)
+
/* These registers reflect D3 status of functions */
#define PMC_D3_STS_0 0xA0
@@ -117,6 +138,10 @@
#define BIT_USH_SS_PHY BIT(2)
#define BIT_DFX BIT(3)
+/* CHT specific bits in PMC_D3_STS_1 register */
+#define BIT_STS_GMM BIT(1)
+#define BIT_STS_ISH BIT(2)
+
/* PMC I/O Registers */
#define ACPI_BASE_ADDR_OFFSET 0x40
#define ACPI_BASE_ADDR_MASK 0xFFFFFE00
@@ -126,4 +151,8 @@
#define SLEEP_TYPE_MASK 0xFFFFECFF
#define SLEEP_TYPE_S5 0x1C00
#define SLEEP_ENABLE 0x2000
+
+extern int pmc_atom_read(int offset, u32 *value);
+extern int pmc_atom_write(int offset, u32 value);
+
#endif /* PMC_ATOM_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
new file mode 100644
index 000000000000..d8ce3ec816ab
--- /dev/null
+++ b/arch/x86/include/asm/pmem.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef __ASM_X86_PMEM_H__
+#define __ASM_X86_PMEM_H__
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/cpufeature.h>
+#include <asm/special_insns.h>
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+/**
+ * arch_memcpy_to_pmem - copy data to persistent memory
+ * @dst: destination buffer for the copy
+ * @src: source buffer for the copy
+ * @n: length of the copy in bytes
+ *
+ * Copy data to persistent memory media via non-temporal stores so that
+ * a subsequent arch_wmb_pmem() can flush cpu and memory controller
+ * write buffers to guarantee durability.
+ */
+static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src,
+ size_t n)
+{
+ int unwritten;
+
+ /*
+ * We are copying between two kernel buffers, if
+ * __copy_from_user_inatomic_nocache() returns an error (page
+ * fault) we would have already reported a general protection fault
+ * before the WARN+BUG.
+ */
+ unwritten = __copy_from_user_inatomic_nocache((void __force *) dst,
+ (void __user *) src, n);
+ if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n",
+ __func__, dst, src, unwritten))
+ BUG();
+}
+
+/**
+ * arch_wmb_pmem - synchronize writes to persistent memory
+ *
+ * After a series of arch_memcpy_to_pmem() operations this drains data
+ * from cpu write buffers and any platform (memory controller) buffers
+ * to ensure that written data is durable on persistent memory media.
+ */
+static inline void arch_wmb_pmem(void)
+{
+ /*
+ * wmb() to 'sfence' all previous writes such that they are
+ * architecturally visible to 'pcommit'. Note, that we've
+ * already arranged for pmem writes to avoid the cache via
+ * arch_memcpy_to_pmem().
+ */
+ wmb();
+ pcommit_sfence();
+}
+
+/**
+ * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * @vaddr: virtual start address
+ * @size: number of bytes to write back
+ *
+ * Write back a cache range using the CLWB (cache line write back)
+ * instruction. This function requires explicit ordering with an
+ * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
+ */
+static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+{
+ u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
+ unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vend = vaddr + size;
+ void *p;
+
+ for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ p < vend; p += x86_clflush_size)
+ clwb(p);
+}
+
+/*
+ * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec
+ * iterators, so for other types (bvec & kvec) we must do a cache write-back.
+ */
+static inline bool __iter_needs_pmem_wb(struct iov_iter *i)
+{
+ return iter_is_iovec(i) == false;
+}
+
+/**
+ * arch_copy_from_iter_pmem - copy data from an iterator to PMEM
+ * @addr: PMEM destination address
+ * @bytes: number of bytes to copy
+ * @i: iterator with source data
+ *
+ * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
+ struct iov_iter *i)
+{
+ void *vaddr = (void __force *)addr;
+ size_t len;
+
+ /* TODO: skip the write-back by always using non-temporal stores */
+ len = copy_from_iter_nocache(vaddr, bytes, i);
+
+ if (__iter_needs_pmem_wb(i))
+ __arch_wb_cache_pmem(vaddr, bytes);
+
+ return len;
+}
+
+/**
+ * arch_clear_pmem - zero a PMEM memory range
+ * @addr: virtual start address
+ * @size: number of bytes to zero
+ *
+ * Write zeros into the memory range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with an arch_wmb_pmem() call.
+ */
+static inline void arch_clear_pmem(void __pmem *addr, size_t size)
+{
+ void *vaddr = (void __force *)addr;
+
+ /* TODO: implement the zeroing via non-temporal writes */
+ if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
+ clear_page(vaddr);
+ else
+ memset(vaddr, 0, size);
+
+ __arch_wb_cache_pmem(vaddr, size);
+}
+
+static inline bool __arch_has_wmb_pmem(void)
+{
+ /*
+ * We require that wmb() be an 'sfence', that is only guaranteed on
+ * 64-bit builds
+ */
+ return static_cpu_has(X86_FEATURE_PCOMMIT);
+}
+#endif /* CONFIG_ARCH_HAS_PMEM_API */
+#endif /* __ASM_X86_PMEM_H__ */
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index dca71714f860..b12f81022a6b 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,9 +90,9 @@ static __always_inline bool __preempt_count_dec_and_test(void)
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
-static __always_inline bool should_resched(void)
+static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(!raw_cpu_read_4(__preempt_count));
+ return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPT
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519df0d5..19577dd325fa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -6,8 +6,8 @@
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
+struct vm86;
-#include <asm/vm86.h>
#include <asm/math_emu.h>
#include <asm/segment.h>
#include <asm/types.h>
@@ -390,9 +390,6 @@ struct thread_struct {
#endif
unsigned long gs;
- /* Floating point and extended processor state */
- struct fpu fpu;
-
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
@@ -403,21 +400,22 @@ struct thread_struct {
unsigned long cr2;
unsigned long trap_nr;
unsigned long error_code;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
/* Virtual 86 mode info */
- struct vm86_struct __user *vm86_info;
- unsigned long screen_bitmap;
- unsigned long v86flags;
- unsigned long v86mask;
- unsigned long saved_sp0;
- unsigned int saved_fs;
- unsigned int saved_gs;
+ struct vm86 *vm86;
#endif
/* IO permissions: */
unsigned long *io_bitmap_ptr;
unsigned long iopl;
/* Max allowed port in the bitmap, in bytes: */
unsigned io_bitmap_max;
+
+ /* Floating point and extended processor state */
+ struct fpu fpu;
+ /*
+ * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
+ * the end.
+ */
};
/*
@@ -647,14 +645,6 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
extern void set_task_blockstep(struct task_struct *task, bool on);
-/*
- * from system description table in BIOS. Mostly for MCA use, but
- * others may find it useful:
- */
-extern unsigned int machine_id;
-extern unsigned int machine_submodel_id;
-extern unsigned int BIOS_revision;
-
/* Boot loader type from the setup header: */
extern int bootloader_type;
extern int bootloader_version;
@@ -716,7 +706,6 @@ static inline void spin_lock_prefetch(const void *x)
#define INIT_THREAD { \
.sp0 = TOP_OF_INIT_STACK, \
- .vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
}
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5fabf1362942..6271281f947d 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
unsigned long phase1_result);
extern long syscall_trace_enter(struct pt_regs *);
-extern void syscall_trace_leave(struct pt_regs *);
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 655e07a48f6c..67f08230103a 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -41,6 +41,7 @@ struct pvclock_wall_clock {
#define PVCLOCK_TSC_STABLE_BIT (1 << 0)
#define PVCLOCK_GUEST_STOPPED (1 << 1)
+/* PVCLOCK_COUNTS_FROM_ZERO broke ABI and can't be used anymore. */
#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 628954ceede1..7a6bed5c08bc 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
static __always_inline
u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
{
- u64 delta = __native_read_tsc() - src->tsc_timestamp;
+ u64 delta = rdtsc_ordered() - src->tsc_timestamp;
return pvclock_scale_delta(delta, src->tsc_to_system_mul,
src->tsc_shift);
}
@@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
u8 ret_flags;
version = src->version;
- /* Note: emulated platforms which do not advertise SSE2 support
- * result in kvmclock not using the necessary RDTSC barriers.
- * Without barriers, it is possible that RDTSC instruction reads from
- * the time stamp counter outside rdtsc_barrier protected section
- * below, resulting in violation of monotonicity.
- */
- rdtsc_barrier();
+
offset = pvclock_get_nsec_offset(src);
ret = src->system_time + offset;
ret_flags = src->flags;
diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index ae0e241e228b..c537cbb038a7 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -2,16 +2,6 @@
#define _ASM_X86_QRWLOCK_H
#include <asm-generic/qrwlock_types.h>
-
-#ifndef CONFIG_X86_PPRO_FENCE
-#define queue_write_unlock queue_write_unlock
-static inline void queue_write_unlock(struct qrwlock *lock)
-{
- barrier();
- ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
-}
-#endif
-
#include <asm-generic/qrwlock.h>
#endif /* _ASM_X86_QRWLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 9d51fae1cba3..eaba08076030 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -39,18 +39,27 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
}
#endif
-#define virt_queued_spin_lock virt_queued_spin_lock
-
-static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+#ifdef CONFIG_PARAVIRT
+#define virt_spin_lock virt_spin_lock
+static inline bool virt_spin_lock(struct qspinlock *lock)
{
if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
return false;
- while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
- cpu_relax();
+ /*
+ * On hypervisors without PARAVIRT_SPINLOCKS support we fall
+ * back to a Test-and-Set spinlock, because fair locks have
+ * horrible lock 'holder' preemption issues.
+ */
+
+ do {
+ while (atomic_read(&lock->val) != 0)
+ cpu_relax();
+ } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0);
return true;
}
+#endif /* CONFIG_PARAVIRT */
#include <asm-generic/qspinlock.h>
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 6fe6b182c998..9dfce4e0417d 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
unsigned long ip;
unsigned long flags;
unsigned short cs;
- unsigned short __pad2; /* Was called gs, but was always zero. */
- unsigned short __pad1; /* Was called fs, but was always zero. */
- unsigned short ss;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short __pad0;
unsigned long err;
unsigned long trapno;
unsigned long oldmask;
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 7c7c27c97daa..1f3175bb994e 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -4,6 +4,7 @@
#include <asm/sigcontext.h>
#include <asm/siginfo.h>
#include <asm/ucontext.h>
+#include <linux/compat.h>
#ifdef CONFIG_X86_32
#define sigframe_ia32 sigframe
@@ -69,6 +70,15 @@ struct rt_sigframe {
#ifdef CONFIG_X86_X32_ABI
+struct ucontext_x32 {
+ unsigned int uc_flags;
+ unsigned int uc_link;
+ compat_stack_t uc_stack;
+ unsigned int uc__pad0; /* needed for alignment */
+ struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */
+ compat_sigset_t uc_sigmask; /* mask last for extensibility */
+};
+
struct rt_sigframe_x32 {
u64 pretcode;
struct ucontext_x32 uc;
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 31eab867e6d3..c481be78fcf1 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -30,7 +30,7 @@ typedef sigset_t compat_sigset_t;
#endif /* __ASSEMBLY__ */
#include <uapi/asm/signal.h>
#ifndef __ASSEMBLY__
-extern void do_notify_resume(struct pt_regs *, void *, __u32);
+extern void do_signal(struct pt_regs *regs);
#define __ARCH_HAS_SA_RESTORER
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2e00bb2a136..58505f01962f 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
* on during the bootup the random pool has true entropy too.
*/
get_random_bytes(&canary, sizeof(canary));
- tsc = __native_read_tsc();
+ tsc = rdtsc();
canary += tsc + (tsc << 32UL);
current->stack_canary = canary;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 751bf4b7bf11..d7f3b3b78ac3 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -79,12 +79,12 @@ do { \
#else /* CONFIG_X86_32 */
/* frame pointer must be last for get_wchan */
-#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t"
-#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t"
+#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
+#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
#define __EXTRA_CLOBBER \
, "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
- "r12", "r13", "r14", "r15", "flags"
+ "r12", "r13", "r14", "r15"
#ifdef CONFIG_CC_STACKPROTECTOR
#define __switch_canary \
@@ -100,11 +100,7 @@ do { \
#define __switch_canary_iparam
#endif /* CC_STACKPROTECTOR */
-/*
- * There is no need to save or restore flags, because flags are always
- * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL
- * has no effect.
- */
+/* Save restore flags to clear handle leaking NT */
#define switch_to(prev, next, last) \
asm volatile(SAVE_CONTEXT \
"movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 592a6a672e07..91dfcafe27a6 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
asmlinkage unsigned long sys_sigreturn(void);
/* kernel/vm86_32.c */
+struct vm86_struct;
asmlinkage long sys_vm86old(struct vm86_struct __user *);
asmlinkage long sys_vm86(unsigned long, unsigned long);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..8afdc3e44247 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -27,14 +27,17 @@
* Without this offset, that can result in a page fault. (We are
* careful that, in this case, the value we read doesn't matter.)
*
- * In vm86 mode, the hardware frame is much longer still, but we neither
- * access the extra members from NMI context, nor do we write such a
- * frame at sp0 at all.
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
*
* x86_64 has a fixed-length stack frame.
*/
#ifdef CONFIG_X86_32
-# define TOP_OF_KERNEL_STACK_PADDING 8
+# ifdef CONFIG_VM86
+# define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+# define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
#else
# define TOP_OF_KERNEL_STACK_PADDING 0
#endif
@@ -140,27 +143,11 @@ struct thread_info {
_TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \
_TIF_NOHZ)
-/* work to do in syscall_trace_leave() */
-#define _TIF_WORK_SYSCALL_EXIT \
- (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \
- _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
-
-/* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK \
- (0x0000FFFF & \
- ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
- _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
-
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
_TIF_NOHZ)
-/* Only used for 64 bit */
-#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
- _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
-
/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd791948b286..6df2029405a3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void)
#endif /* SMP */
+/* Not inlined due to inc_irq_stat not being defined yet */
+#define flush_tlb_local() { \
+ inc_irq_stat(irq_tlb_count); \
+ local_flush_tlb(); \
+}
+
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c5380bea2a36..c3496619740a 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
asmlinkage void smp_deferred_error_interrupt(void);
#endif
-extern enum ctx_state ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
extern void ist_begin_non_atomic(struct pt_regs *regs);
extern void ist_end_non_atomic(void);
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0e9cee..6d7c5479bcea 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -21,28 +21,12 @@ extern void disable_TSC(void);
static inline cycles_t get_cycles(void)
{
- unsigned long long ret = 0;
-
#ifndef CONFIG_X86_TSC
if (!cpu_has_tsc)
return 0;
#endif
- rdtscll(ret);
-
- return ret;
-}
-static __always_inline cycles_t vget_cycles(void)
-{
- /*
- * We only do VDSOs on TSC capable CPUs, so this shouldn't
- * access boot_cpu_data (which is not VDSO-safe):
- */
-#ifndef CONFIG_X86_TSC
- if (!cpu_has_tsc)
- return 0;
-#endif
- return (cycles_t)__native_read_tsc();
+ return rdtsc();
}
extern void tsc_init(void);
@@ -51,6 +35,7 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern int check_tsc_disabled(void);
extern unsigned long native_calibrate_tsc(void);
+extern unsigned long long native_sched_clock_from_tsc(u64 tsc);
extern int tsc_clocksource_reliable;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 1d8de3f3feca..1e491f3af317 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_VM86_H
#define _ASM_X86_VM86_H
-
#include <asm/ptrace.h>
#include <uapi/asm/vm86.h>
@@ -28,43 +27,49 @@ struct kernel_vm86_regs {
unsigned short gs, __gsh;
};
-struct kernel_vm86_struct {
- struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 flags
+struct vm86 {
+ struct vm86plus_struct __user *user_vm86;
+ struct pt_regs regs32;
+ unsigned long veflags;
+ unsigned long veflags_mask;
+ unsigned long saved_sp0;
+
unsigned long flags;
unsigned long screen_bitmap;
unsigned long cpu_type;
struct revectored_struct int_revectored;
struct revectored_struct int21_revectored;
struct vm86plus_info_struct vm86plus;
- struct pt_regs *regs32; /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
- long return-eip; from call to vm86()
- struct pt_regs oldregs; user space registers as saved by syscall
- */
};
#ifdef CONFIG_VM86
void handle_vm86_fault(struct kernel_vm86_regs *, long);
int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
struct task_struct;
+
+#define free_vm86(t) do { \
+ struct thread_struct *__t = (t); \
+ if (__t->vm86 != NULL) { \
+ kfree(__t->vm86); \
+ __t->vm86 = NULL; \
+ } \
+} while (0)
+
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ 3
+#define LAST_VM86_IRQ 15
+
+static inline int invalid_vm86_irq(int irq)
+{
+ return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
void release_vm86_irqs(struct task_struct *);
#else
@@ -77,6 +82,10 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
return 0;
}
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
+#define free_vm86(t) do { } while(0)
+
#endif /* CONFIG_VM86 */
#endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index da772edd19ab..448b7ca61aee 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -47,6 +47,7 @@
#define CPU_BASED_MOV_DR_EXITING 0x00800000
#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
#define CPU_BASED_USE_IO_BITMAPS 0x02000000
+#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000
#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
#define CPU_BASED_MONITOR_EXITING 0x20000000
#define CPU_BASED_PAUSE_EXITING 0x40000000
@@ -367,29 +368,29 @@ enum vmcs_field {
#define TYPE_PHYSICAL_APIC_EVENT (10 << 12)
#define TYPE_PHYSICAL_APIC_INST (15 << 12)
-/* segment AR */
-#define SEGMENT_AR_L_MASK (1 << 13)
-
-#define AR_TYPE_ACCESSES_MASK 1
-#define AR_TYPE_READABLE_MASK (1 << 1)
-#define AR_TYPE_WRITEABLE_MASK (1 << 2)
-#define AR_TYPE_CODE_MASK (1 << 3)
-#define AR_TYPE_MASK 0x0f
-#define AR_TYPE_BUSY_64_TSS 11
-#define AR_TYPE_BUSY_32_TSS 11
-#define AR_TYPE_BUSY_16_TSS 3
-#define AR_TYPE_LDT 2
-
-#define AR_UNUSABLE_MASK (1 << 16)
-#define AR_S_MASK (1 << 4)
-#define AR_P_MASK (1 << 7)
-#define AR_L_MASK (1 << 13)
-#define AR_DB_MASK (1 << 14)
-#define AR_G_MASK (1 << 15)
-#define AR_DPL_SHIFT 5
-#define AR_DPL(ar) (((ar) >> AR_DPL_SHIFT) & 3)
-
-#define AR_RESERVD_MASK 0xfffe0f00
+/* segment AR in VMCS -- these are different from what LAR reports */
+#define VMX_SEGMENT_AR_L_MASK (1 << 13)
+
+#define VMX_AR_TYPE_ACCESSES_MASK 1
+#define VMX_AR_TYPE_READABLE_MASK (1 << 1)
+#define VMX_AR_TYPE_WRITEABLE_MASK (1 << 2)
+#define VMX_AR_TYPE_CODE_MASK (1 << 3)
+#define VMX_AR_TYPE_MASK 0x0f
+#define VMX_AR_TYPE_BUSY_64_TSS 11
+#define VMX_AR_TYPE_BUSY_32_TSS 11
+#define VMX_AR_TYPE_BUSY_16_TSS 3
+#define VMX_AR_TYPE_LDT 2
+
+#define VMX_AR_UNUSABLE_MASK (1 << 16)
+#define VMX_AR_S_MASK (1 << 4)
+#define VMX_AR_P_MASK (1 << 7)
+#define VMX_AR_L_MASK (1 << 13)
+#define VMX_AR_DB_MASK (1 << 14)
+#define VMX_AR_G_MASK (1 << 15)
+#define VMX_AR_DPL_SHIFT 5
+#define VMX_AR_DPL(ar) (((ar) >> VMX_AR_DPL_SHIFT) & 3)
+
+#define VMX_AR_RESERVD_MASK 0xfffe0f00
#define TSS_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 0)
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 1)
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
index 608a79d5a466..e6911caf5bbf 100644
--- a/arch/x86/include/asm/xen/events.h
+++ b/arch/x86/include/asm/xen/events.h
@@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs)
/* No need for a barrier -- XCHG is a barrier on x86. */
#define xchg_xen_ulong(ptr, val) xchg((ptr), (val))
+extern int xen_have_vector_callback;
+
+/*
+ * Events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 and hence cannot be rebound.
+ */
+static inline bool xen_support_evtchn_rebind(void)
+{
+ return (!xen_hvm_domain() || xen_have_vector_callback);
+}
+
#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ca08a27b90b3..4c20dd333412 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -336,10 +336,10 @@ HYPERVISOR_update_descriptor(u64 ma, u64 desc)
return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
}
-static inline int
+static inline long
HYPERVISOR_memory_op(unsigned int cmd, void *arg)
{
- return _hypercall2(int, memory_op, cmd, arg);
+ return _hypercall2(long, memory_op, cmd, arg);
}
static inline int
@@ -465,6 +465,12 @@ HYPERVISOR_tmem_op(
return _hypercall1(int, tmem_op, op);
}
+static inline int
+HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
+{
+ return _hypercall2(int, xenpmu_op, op, arg);
+}
+
static inline void
MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
{
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index 3400dbaec3c3..62ca03ef5c65 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -3,12 +3,38 @@
*
* Guest OS interface to x86 Xen.
*
- * Copyright (c) 2004, K A Fraser
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2004-2006, K A Fraser
*/
#ifndef _ASM_X86_XEN_INTERFACE_H
#define _ASM_X86_XEN_INTERFACE_H
+/*
+ * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field
+ * in a struct in memory.
+ * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an
+ * hypercall argument.
+ * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but
+ * they might not be on other architectures.
+ */
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
typedef struct { type *p; } __guest_handle_ ## name
@@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t);
* start of the GDT because some stupid OSes export hard-coded selector values
* in their ABI. These hard-coded values are always near the start of the GDT,
* so Xen places itself out of the way, at the far end of the GDT.
+ *
+ * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op
*/
#define FIRST_RESERVED_GDT_PAGE 14
#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * Send an array of these to HYPERVISOR_set_trap_table()
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * Terminate the array with a sentinel entry, with traps[].address==0.
* The privilege level specifies which modes may enter a trap via a software
* interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
* privilege levels as follows:
@@ -118,10 +147,41 @@ struct trap_info {
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
+ /*
+ * Number of valid entries in the p2m table(s) anchored at
+ * pfn_to_mfn_frame_list_list and/or p2m_vaddr.
+ */
+ unsigned long max_pfn;
+ /*
+ * Frame containing list of mfns containing list of mfns containing p2m.
+ * A value of 0 indicates it has not yet been set up, ~0 indicates it
+ * has been set to invalid e.g. due to the p2m being too large for the
+ * 3-level p2m tree. In this case the linear mapper p2m list anchored
+ * at p2m_vaddr is to be used.
+ */
+ xen_pfn_t pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
+ /*
+ * Following three fields are valid if p2m_cr3 contains a value
+ * different from 0.
+ * p2m_cr3 is the root of the address space where p2m_vaddr is valid.
+ * p2m_cr3 is in the same format as a cr3 value in the vcpu register
+ * state and holds the folded machine frame number (via xen_pfn_to_cr3)
+ * of a L3 or L4 page table.
+ * p2m_vaddr holds the virtual address of the linear p2m list. All
+ * entries in the range [0...max_pfn[ are accessible via this pointer.
+ * p2m_generation will be incremented by the guest before and after each
+ * change of the mappings of the p2m list. p2m_generation starts at 0
+ * and a value with the least significant bit set indicates that a
+ * mapping update is in progress. This allows guest external software
+ * (e.g. in Dom0) to verify that read mappings are consistent and
+ * whether they have changed since the last check.
+ * Modifying a p2m element in the linear p2m list is allowed via an
+ * atomic write only.
+ */
+ unsigned long p2m_cr3; /* cr3 value of the p2m address space */
+ unsigned long p2m_vaddr; /* virtual address of the p2m list */
+ unsigned long p2m_generation; /* generation count of p2m mapping */
};
#endif /* !__ASSEMBLY__ */
@@ -137,13 +197,31 @@ struct arch_shared_info {
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ *
+ * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise
+ * for HVM and PVH guests, not all information in this structure is updated:
+ *
+ * - For HVM guests, the structures read include: fpu_ctxt (if
+ * VGCT_I387_VALID is set), flags, user_regs, debugreg[*]
+ *
+ * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to
+ * set cr3. All other fields not used should be set to 0.
*/
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
-#define VGCF_I387_VALID (1<<0)
-#define VGCF_HVM_GUEST (1<<1)
-#define VGCF_IN_KERNEL (1<<2)
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_IN_KERNEL (1<<2)
+#define _VGCF_i387_valid 0
+#define VGCF_i387_valid (1<<_VGCF_i387_valid)
+#define _VGCF_in_kernel 2
+#define VGCF_in_kernel (1<<_VGCF_in_kernel)
+#define _VGCF_failsafe_disables_events 3
+#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
+#define _VGCF_syscall_disables_events 4
+#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
+#define _VGCF_online 5
+#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
@@ -172,6 +250,129 @@ struct vcpu_guest_context {
#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
+
+/* AMD PMU registers and structures */
+struct xen_pmu_amd_ctxt {
+ /*
+ * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t counters;
+ uint32_t ctrls;
+
+ /* Counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Intel PMU registers and structures */
+struct xen_pmu_cntr_pair {
+ uint64_t counter;
+ uint64_t control;
+};
+
+struct xen_pmu_intel_ctxt {
+ /*
+ * Offsets to fixed and architectural counter MSRs (relative to
+ * xen_pmu_arch.c.intel).
+ * For PV(H) guests these fields are RO.
+ */
+ uint32_t fixed_counters;
+ uint32_t arch_counters;
+
+ /* PMU registers */
+ uint64_t global_ctrl;
+ uint64_t global_ovf_ctrl;
+ uint64_t global_status;
+ uint64_t fixed_ctrl;
+ uint64_t ds_area;
+ uint64_t pebs_enable;
+ uint64_t debugctl;
+
+ /* Fixed and architectural counter MSRs */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+ uint64_t regs[];
+#elif defined(__GNUC__)
+ uint64_t regs[0];
+#endif
+};
+
+/* Sampled domain's registers */
+struct xen_pmu_regs {
+ uint64_t ip;
+ uint64_t sp;
+ uint64_t flags;
+ uint16_t cs;
+ uint16_t ss;
+ uint8_t cpl;
+ uint8_t pad[3];
+};
+
+/* PMU flags */
+#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */
+
+/*
+ * Architecture-specific information describing state of the processor at
+ * the time of PMU interrupt.
+ * Fields of this structure marked as RW for guest should only be written by
+ * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the
+ * hypervisor during PMU interrupt). Hypervisor will read updated data in
+ * XENPMU_flush hypercall and clear PMU_CACHED bit.
+ */
+struct xen_pmu_arch {
+ union {
+ /*
+ * Processor's registers at the time of interrupt.
+ * WO for hypervisor, RO for guests.
+ */
+ struct xen_pmu_regs regs;
+ /*
+ * Padding for adding new registers to xen_pmu_regs in
+ * the future
+ */
+#define XENPMU_REGS_PAD_SZ 64
+ uint8_t pad[XENPMU_REGS_PAD_SZ];
+ } r;
+
+ /* WO for hypervisor, RO for guest */
+ uint64_t pmu_flags;
+
+ /*
+ * APIC LVTPC register.
+ * RW for both hypervisor and guest.
+ * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware
+ * during XENPMU_flush or XENPMU_lvtpc_set.
+ */
+ union {
+ uint32_t lapic_lvtpc;
+ uint64_t pad;
+ } l;
+
+ /*
+ * Vendor-specific PMU registers.
+ * RW for both hypervisor and guest (see exceptions above).
+ * Guest's updates to this field are verified and then loaded by the
+ * hypervisor into hardware during XENPMU_flush
+ */
+ union {
+ struct xen_pmu_amd_ctxt amd;
+ struct xen_pmu_intel_ctxt intel;
+
+ /*
+ * Padding for contexts (fixed parts only, does not include
+ * MSR banks that are specified by offsets)
+ */
+#define XENPMU_CTXT_PAD_SZ 128
+ uint8_t pad[XENPMU_CTXT_PAD_SZ];
+ } c;
+};
+
#endif /* !__ASSEMBLY__ */
/*
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c44a5d53e464..0679e11d2cf7 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -35,9 +35,7 @@ typedef struct xpaddr {
#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
-/* Maximum amount of memory we can handle in a domain in pages */
-#define MAX_DOMAIN_PAGES \
- ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr;
@@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern unsigned long set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
@@ -103,6 +101,11 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
+ /*
+ * Some x86 code are still using pfn_to_mfn instead of
+ * pfn_to_mfn. This will have to be removed when we figured
+ * out which call.
+ */
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
@@ -149,6 +152,11 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
{
unsigned long pfn;
+ /*
+ * Some x86 code are still using mfn_to_pfn instead of
+ * gfn_to_pfn. This will have to be removed when we figure
+ * out which call.
+ */
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
@@ -178,6 +186,27 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
}
+/* Pseudo-physical <-> Guest conversion */
+static inline unsigned long pfn_to_gfn(unsigned long pfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return pfn;
+ else
+ return pfn_to_mfn(pfn);
+}
+
+static inline unsigned long gfn_to_pfn(unsigned long gfn)
+{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return gfn;
+ else
+ return mfn_to_pfn(gfn);
+}
+
+/* Pseudo-physical <-> Bus conversion */
+#define pfn_to_bfn(pfn) pfn_to_gfn(pfn)
+#define bfn_to_pfn(bfn) gfn_to_pfn(bfn)
+
/*
* We detect special mappings in one of two ways:
* 1. If the MFN is an I/O page then Xen will set the m2p entry
@@ -198,7 +227,7 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine)
* require. In all the cases we care about, the FOREIGN_FRAME bit is
* masked (e.g., pfn_to_mfn()) so behaviour there is correct.
*/
-static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
{
unsigned long pfn;
@@ -217,6 +246,10 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+/* VIRT <-> GUEST conversion */
+#define virt_to_gfn(v) (pfn_to_gfn(virt_to_pfn(v)))
+#define gfn_to_virt(g) (__va(gfn_to_pfn(g) << PAGE_SHIFT))
+
static inline unsigned long pte_mfn(pte_t pte)
{
return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
@@ -264,7 +297,7 @@ void make_lowmem_page_readwrite(void *vaddr);
static inline bool xen_arch_need_swiotlb(struct device *dev,
unsigned long pfn,
- unsigned long mfn)
+ unsigned long bfn)
{
return false;
}
diff --git a/arch/x86/include/uapi/asm/bitsperlong.h b/arch/x86/include/uapi/asm/bitsperlong.h
index b0ae1c4dc791..217909b4d6f5 100644
--- a/arch/x86/include/uapi/asm/bitsperlong.h
+++ b/arch/x86/include/uapi/asm/bitsperlong.h
@@ -1,7 +1,7 @@
#ifndef __ASM_X86_BITSPERLONG_H
#define __ASM_X86_BITSPERLONG_H
-#ifdef __x86_64__
+#if defined(__x86_64__) && !defined(__ILP32__)
# define __BITS_PER_LONG 64
#else
# define __BITS_PER_LONG 32
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index ab456dc233b5..329254373479 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -120,7 +120,7 @@ struct boot_params {
__u8 _pad3[16]; /* 0x070 */
__u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
- struct sys_desc_table sys_desc_table; /* 0x0a0 */
+ struct sys_desc_table sys_desc_table; /* obsolete! */ /* 0x0a0 */
struct olpc_ofw_header olpc_ofw_header; /* 0x0b0 */
__u32 ext_ramdisk_image; /* 0x0c0 */
__u32 ext_ramdisk_size; /* 0x0c4 */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index 0f457e6eab18..9dafe59cf6e2 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -37,7 +37,7 @@
/*
* This is a non-standardized way to represent ADR or NVDIMM regions that
* persist over a reboot. The kernel will ignore their special capabilities
- * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ * unless the CONFIG_X86_PMEM_LEGACY option is set.
*
* ( Note that older platforms also used 6 for the same type of memory,
* but newer versions switched to 12 as 6 was assigned differently. Some
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 8fba544e9cc4..f0412c50c47b 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -27,6 +27,8 @@
#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
+/* Partition reference TSC MSR is available */
+#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE (1 << 9)
/* A partition's reference time stamp counter (TSC) page */
#define HV_X64_MSR_REFERENCE_TSC 0x40000021
@@ -108,6 +110,8 @@
#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
/* Support for a virtual guest idle state is available */
#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
+/* Guest crash data handler available */
+#define HV_X64_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
/*
* Implementation recommendations. Indicates which behaviors the hypervisor
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a4ae82eb82aa..cd54147cb365 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -354,7 +354,7 @@ struct kvm_xcrs {
struct kvm_sync_regs {
};
-#define KVM_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index a0eab85ce7b8..76880ede9a35 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -15,7 +15,8 @@ struct mce {
__u64 time; /* wall time_t when error was detected */
__u8 cpuvendor; /* cpu vendor as encoded in system.h */
__u8 inject_flags; /* software inject flags */
- __u16 pad;
+ __u8 severity;
+ __u8 usable_addr;
__u32 cpuid; /* CPUID 1 EAX */
__u8 cs; /* code segment */
__u8 bank; /* machine check bank */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 180a0c3c224d..79887abcb5e1 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -37,8 +37,6 @@
#define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT)
#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
-#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */
-#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT)
#define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */
#define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT)
#define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 0e8a973de9ee..40836a9a7250 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,24 +177,9 @@ struct sigcontext {
__u64 rip;
__u64 eflags; /* RFLAGS */
__u16 cs;
-
- /*
- * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
- * Linux saved and restored fs and gs in these slots. This
- * was counterproductive, as fsbase and gsbase were never
- * saved, so arch_prctl was presumably unreliable.
- *
- * If these slots are ever needed for any other purpose, there
- * is some risk that very old 64-bit binaries could get
- * confused. I doubt that many such binaries still work,
- * though, since the same patch in 2.5.64 also removed the
- * 64-bit set_thread_area syscall, so it appears that there is
- * no TLS API that works in both pre- and post-2.5.64 kernels.
- */
- __u16 __pad2; /* Was gs. */
- __u16 __pad1; /* Was fs. */
-
- __u16 ss;
+ __u16 gs;
+ __u16 fs;
+ __u16 __pad0;
__u64 err;
__u64 trapno;
__u64 oldmask;
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 1fe92181ee9e..37fee272618f 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -58,6 +58,7 @@
#define EXIT_REASON_INVALID_STATE 33
#define EXIT_REASON_MSR_LOAD_FAIL 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_TRAP_FLAG 37
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
#define EXIT_REASON_MCE_DURING_VMENTRY 41
@@ -106,6 +107,7 @@
{ EXIT_REASON_MSR_READ, "MSR_READ" }, \
{ EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
{ EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
+ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \
{ EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
{ EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
{ EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af41bd80..b1b78ffe01d0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT) += signal_compat.o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-y += probe_roms.o
@@ -69,8 +71,8 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
-obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
-obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
@@ -92,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
-obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
+obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -107,8 +109,6 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
-obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
-obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e49ee24da85e..ded848c20e05 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -445,6 +445,7 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
* stash over-ride to indicate we've been here
@@ -710,7 +711,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
#endif
}
-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
{
int cpu;
@@ -726,12 +727,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
*pcpu = cpu;
return 0;
}
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
-{
- return _acpi_map_lsapic(handle, physid, pcpu);
-}
EXPORT_SYMBOL(acpi_map_cpu);
int acpi_unmap_cpu(int cpu)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c42827eb86cf..25f909362b7a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -338,10 +338,15 @@ done:
static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
{
+ unsigned long flags;
+
if (instr[0] != 0x90)
return;
+ local_irq_save(flags);
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+ sync_core();
+ local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
instr, a->instrlen - a->padlen, a->padlen);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ede92c3364d3..222a57076039 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
/* Verify whether apbt counter works */
t1 = dw_apb_clocksource_read(clocksource_apbt);
- rdtscll(start);
+ start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
*/
do {
rep_nop();
- rdtscll(now);
+ now = rdtsc();
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
- t1 = __native_read_tsc();
+ t1 = rdtsc();
do {
new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
- t2 = __native_read_tsc();
+ t2 = rdtsc();
shift = 5;
if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb52850a28f..24e94ce454e2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -336,6 +336,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
apic_write(APIC_LVTT, lvtt_value);
if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
+ /*
+ * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode,
+ * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized.
+ * According to Intel, MFENCE can do the serialization here.
+ */
+ asm volatile("mfence" : : : "memory");
+
printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -457,45 +464,45 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
- rdtscll(tsc);
+ tsc = rdtsc();
wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
}
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int lapic_timer_shutdown(struct clock_event_device *evt)
{
- unsigned long flags;
unsigned int v;
/* Lapic used as dummy for broadcast ? */
if (evt->features & CLOCK_EVT_FEAT_DUMMY)
- return;
+ return 0;
- local_irq_save(flags);
+ v = apic_read(APIC_LVTT);
+ v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+ apic_write(APIC_LVTT, v);
+ apic_write(APIC_TMICT, 0);
+ return 0;
+}
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(lapic_timer_frequency,
- mode != CLOCK_EVT_MODE_PERIODIC, 1);
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- v = apic_read(APIC_LVTT);
- v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
- apic_write(APIC_LVTT, v);
- apic_write(APIC_TMICT, 0);
- break;
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here */
- break;
- }
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+ /* Lapic used as dummy for broadcast ? */
+ if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+ return 0;
- local_irq_restore(flags);
+ __setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+ return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+ return lapic_timer_set_periodic_oneshot(evt, true);
}
/*
@@ -513,15 +520,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
* The local apic timer can be used for any function which is CPU local.
*/
static struct clock_event_device lapic_clockevent = {
- .name = "lapic",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
- | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
- .shift = 32,
- .set_mode = lapic_timer_setup,
- .set_next_event = lapic_next_event,
- .broadcast = lapic_timer_broadcast,
- .rating = 100,
- .irq = -1,
+ .name = "lapic",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+ | CLOCK_EVT_FEAT_DUMMY,
+ .shift = 32,
+ .set_state_shutdown = lapic_timer_shutdown,
+ .set_state_periodic = lapic_timer_set_periodic,
+ .set_state_oneshot = lapic_timer_set_oneshot,
+ .set_next_event = lapic_next_event,
+ .broadcast = lapic_timer_broadcast,
+ .rating = 100,
+ .irq = -1,
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
@@ -592,7 +602,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
unsigned long pm = acpi_pm_read_early();
if (cpu_has_tsc)
- rdtscll(tsc);
+ tsc = rdtsc();
switch (lapic_cal_loops++) {
case 0:
@@ -778,7 +788,7 @@ static int __init calibrate_APIC_clock(void)
* Setup the apic timer manually
*/
levt->event_handler = lapic_cal_handler;
- lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+ lapic_timer_set_periodic(levt);
lapic_cal_loops = -1;
/* Let the interrupts run */
@@ -788,7 +798,8 @@ static int __init calibrate_APIC_clock(void)
cpu_relax();
/* Stop the lapic timer */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+ local_irq_disable();
+ lapic_timer_shutdown(levt);
/* Jiffies delta */
deltaj = lapic_cal_j2 - lapic_cal_j1;
@@ -799,8 +810,8 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
else
levt->features |= CLOCK_EVT_FEAT_DUMMY;
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
pr_warning("APIC timer disabled due to verification failure\n");
@@ -878,7 +889,7 @@ static void local_apic_timer_interrupt(void)
if (!evt->event_handler) {
pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
/* Switch it off */
- lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+ lapic_timer_shutdown(evt);
return;
}
@@ -1209,7 +1220,7 @@ void setup_local_APIC(void)
long long max_loops = cpu_khz ? cpu_khz : 1000000;
if (cpu_has_tsc)
- rdtscll(tsc);
+ tsc = rdtsc();
if (disable_apic) {
disable_ioapic_support();
@@ -1293,7 +1304,7 @@ void setup_local_APIC(void)
}
if (queued) {
if (cpu_has_tsc && cpu_khz) {
- rdtscll(ntsc);
+ ntsc = rdtsc();
max_loops = (cpu_khz << 10) - (ntsc - tsc);
} else
max_loops--;
@@ -1424,7 +1435,7 @@ static inline void __x2apic_disable(void)
{
u64 msr;
- if (cpu_has_apic)
+ if (!cpu_has_apic)
return;
rdmsrl(MSR_IA32_APICBASE, msr);
@@ -1483,10 +1494,13 @@ void x2apic_setup(void)
static __init void x2apic_disable(void)
{
- u32 x2apic_id;
+ u32 x2apic_id, state = x2apic_state;
- if (x2apic_state != X2APIC_ON)
- goto out;
+ x2apic_mode = 0;
+ x2apic_state = X2APIC_DISABLED;
+
+ if (state != X2APIC_ON)
+ return;
x2apic_id = read_apic_id();
if (x2apic_id >= 255)
@@ -1494,9 +1508,6 @@ static __init void x2apic_disable(void)
__x2apic_disable();
register_lapic_address(mp_lapic_addr);
-out:
- x2apic_state = X2APIC_DISABLED;
- x2apic_mode = 0;
}
static __init void x2apic_enable(void)
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index de918c410eae..f92ab36979a2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -191,7 +191,6 @@ static struct apic apic_flat = {
.send_IPI_all = flat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .wait_for_init_deassert = false,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
@@ -299,7 +298,6 @@ static struct apic apic_physflat = {
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .wait_for_init_deassert = false,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index b205cdbdbe6a..0d96749cfcac 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -152,7 +152,6 @@ struct apic apic_noop = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- .wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 017149cded07..b548fd3b764b 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -92,7 +92,6 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
- atomic_set(&init_deasserted, 1);
return 0;
}
@@ -235,7 +234,6 @@ static const struct apic apic_numachip __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .wait_for_init_deassert = false,
.inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index c4a8d63f8220..971cf8875939 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -186,7 +186,6 @@ static struct apic apic_bigsmp = {
.send_IPI_all = bigsmp_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .wait_for_init_deassert = true,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 6873ab925d00..045e424fb368 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh)
#endif
#ifdef arch_trigger_all_cpu_backtrace
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE 4096
-
-struct nmi_seq_buf {
- unsigned char buffer[NMI_BUF_SIZE];
- struct seq_buf seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
-
-/* "in progress" flag of arch_trigger_all_cpu_backtrace */
-static unsigned long backtrace_flag;
-
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void nmi_raise_cpu_backtrace(cpumask_t *mask)
{
- const char *buf = s->buffer + start;
-
- printk("%.*s", (end - start) + 1, buf);
+ apic->send_IPI_mask(mask, NMI_VECTOR);
}
void arch_trigger_all_cpu_backtrace(bool include_self)
{
- struct nmi_seq_buf *s;
- int len;
- int cpu;
- int i;
- int this_cpu = get_cpu();
-
- if (test_and_set_bit(0, &backtrace_flag)) {
- /*
- * If there is already a trigger_all_cpu_backtrace() in progress
- * (backtrace_flag == 1), don't output double cpu dump infos.
- */
- put_cpu();
- return;
- }
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
- if (!include_self)
- cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
-
- cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
- /*
- * Set up per_cpu seq_buf buffers that the NMIs running on the other
- * CPUs will write to.
- */
- for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
- s = &per_cpu(nmi_print_seq, cpu);
- seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
- }
-
- if (!cpumask_empty(to_cpumask(backtrace_mask))) {
- pr_info("sending NMI to %s CPUs:\n",
- (include_self ? "all" : "other"));
- apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR);
- }
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- touch_softlockup_watchdog();
- }
-
- /*
- * Now that all the NMIs have triggered, we can dump out their
- * back traces safely to the console.
- */
- for_each_cpu(cpu, &printtrace_mask) {
- int last_i = 0;
-
- s = &per_cpu(nmi_print_seq, cpu);
- len = seq_buf_used(&s->seq);
- if (!len)
- continue;
-
- /* Print line by line. */
- for (i = 0; i < len; i++) {
- if (s->buffer[i] == '\n') {
- print_seq_line(s, last_i, i);
- last_i = i + 1;
- }
- }
- /* Check if there was a partial line. */
- if (last_i < len) {
- print_seq_line(s, last_i, len - 1);
- pr_cont("\n");
- }
- }
-
- clear_bit(0, &backtrace_flag);
- smp_mb__after_atomic();
- put_cpu();
-}
-
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
- struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
- unsigned int len = seq_buf_used(&s->seq);
-
- seq_buf_vprintf(&s->seq, fmt, args);
- return seq_buf_used(&s->seq) - len;
+ nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace);
}
static int
arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
{
- int cpu;
-
- cpu = smp_processor_id();
-
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- printk_func_t printk_func_save = this_cpu_read(printk_func);
-
- /* Replace printk to write into the NMI seq */
- this_cpu_write(printk_func, nmi_vprintk);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- this_cpu_write(printk_func, printk_func_save);
-
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ if (nmi_cpu_backtrace(regs))
return NMI_HANDLED;
- }
return NMI_DONE;
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 845dc0df2002..5c60bb162622 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -943,7 +943,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
if (info->ioapic_trigger != data->trigger)
- mp_register_handler(irq, data->trigger);
+ mp_register_handler(irq, info->ioapic_trigger);
data->entry.trigger = data->trigger = info->ioapic_trigger;
data->entry.polarity = data->polarity = info->ioapic_polarity;
}
@@ -2522,6 +2522,7 @@ void __init setup_ioapic_dest(void)
int pin, ioapic, irq, irq_entry;
const struct cpumask *mask;
struct irq_data *idata;
+ struct irq_chip *chip;
if (skip_ioapic_setup == 1)
return;
@@ -2541,13 +2542,13 @@ void __init setup_ioapic_dest(void)
* Honour affinities which have been set in early boot
*/
if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
- mask = idata->affinity;
+ mask = irq_data_get_affinity_mask(idata);
else
mask = apic->target_cpus();
- irq_set_affinity(irq, mask);
+ chip = irq_data_get_irq_chip(idata);
+ chip->irq_set_affinity(idata, mask, false);
}
-
}
#endif
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 1a9d735e09c6..5f1feb6854af 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain)
static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
- hpet_msi_write(data->handler_data, msg);
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
}
static struct irq_chip hpet_msi_controller = {
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index bda488680dbc..7694ae6c1199 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -111,7 +111,6 @@ static struct apic apic_default = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .wait_for_init_deassert = true,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 28eba2d38b15..836d11b92811 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,8 +169,7 @@ next:
goto next;
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
- if (per_cpu(vector_irq, new_cpu)[vector] >
- VECTOR_UNDEFINED)
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
goto next;
}
/* Found one! */
@@ -182,7 +181,7 @@ next:
cpumask_intersects(d->old_domain, cpu_online_mask);
}
for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
+ per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
err = 0;
@@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node,
static void clear_irq_vector(int irq, struct apic_chip_data *data)
{
- int cpu, vector;
+ struct irq_desc *desc;
unsigned long flags;
+ int cpu, vector;
raw_spin_lock_irqsave(&vector_lock, flags);
BUG_ON(!data->cfg.vector);
vector = data->cfg.vector;
for_each_cpu_and(cpu, data->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
data->cfg.vector = 0;
cpumask_clear(data->domain);
@@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
return;
}
+ desc = irq_to_desc(irq);
for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
+ if (per_cpu(vector_irq, cpu)[vector] != desc)
continue;
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
break;
}
}
@@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
struct irq_alloc_info *info = arg;
struct apic_chip_data *data;
struct irq_data *irq_data;
- int i, err;
+ int i, err, node;
if (disable_apic)
return -ENXIO;
@@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
for (i = 0; i < nr_irqs; i++) {
irq_data = irq_domain_get_irq_data(domain, virq + i);
BUG_ON(!irq_data);
+ node = irq_data_get_node(irq_data);
#ifdef CONFIG_X86_IO_APIC
if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
data = legacy_irq_data[virq + i];
else
#endif
- data = alloc_apic_chip_data(irq_data->node);
+ data = alloc_apic_chip_data(node);
if (!data) {
err = -ENOMEM;
goto error;
@@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irq_data->chip = &lapic_controller;
irq_data->chip_data = data;
irq_data->hwirq = virq + i;
- err = assign_irq_vector_policy(virq, irq_data->node, data,
- info);
+ err = assign_irq_vector_policy(virq + i, node, data, info);
if (err)
goto error;
}
@@ -403,49 +404,43 @@ int __init arch_early_irq_init(void)
return arch_early_ioapic_init();
}
+/* Initialize vector_irq on a new cpu */
static void __setup_vector_irq(int cpu)
{
- /* Initialize vector_irq on a new cpu */
- int irq, vector;
struct apic_chip_data *data;
+ struct irq_desc *desc;
+ int irq, vector;
- /*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
- */
- raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
- for_each_active_irq(irq) {
- data = apic_chip_data(irq_get_irq_data(irq));
- if (!data)
- continue;
+ for_each_irq_desc(irq, desc) {
+ struct irq_data *idata = irq_desc_get_irq_data(desc);
- if (!cpumask_test_cpu(cpu, data->domain))
+ data = apic_chip_data(idata);
+ if (!data || !cpumask_test_cpu(cpu, data->domain))
continue;
vector = data->cfg.vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
+ per_cpu(vector_irq, cpu)[vector] = desc;
}
/* Mark the free vectors */
for (vector = 0; vector < NR_VECTORS; ++vector) {
- irq = per_cpu(vector_irq, cpu)[vector];
- if (irq <= VECTOR_UNDEFINED)
+ desc = per_cpu(vector_irq, cpu)[vector];
+ if (IS_ERR_OR_NULL(desc))
continue;
- data = apic_chip_data(irq_get_irq_data(irq));
+ data = apic_chip_data(irq_desc_get_irq_data(desc));
if (!cpumask_test_cpu(cpu, data->domain))
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
}
- raw_spin_unlock(&vector_lock);
}
/*
- * Setup the vector to irq mappings.
+ * Setup the vector to irq mappings. Must be called with vector_lock held.
*/
void setup_vector_irq(int cpu)
{
int irq;
+ lockdep_assert_held(&vector_lock);
/*
* On most of the platforms, legacy PIC delivers the interrupts on the
* boot cpu. But there are certain platforms where PIC interrupts are
@@ -454,7 +449,7 @@ void setup_vector_irq(int cpu)
* legacy vector to irq mapping:
*/
for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
+ per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);
__setup_vector_irq(cpu);
}
@@ -494,9 +489,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
err = assign_irq_vector(irq, data, dest);
if (err) {
- struct irq_data *top = irq_get_irq_data(irq);
-
- if (assign_irq_vector(irq, data, top->affinity))
+ if (assign_irq_vector(irq, data,
+ irq_data_get_affinity_mask(irq_data)))
pr_err("Failed to recover vector for irq %d\n", irq);
return err;
}
@@ -544,27 +538,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
entering_ack_irq();
+ /* Prevent vectors vanishing under us */
+ raw_spin_lock(&vector_lock);
+
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- int irq;
- unsigned int irr;
- struct irq_desc *desc;
struct apic_chip_data *data;
+ struct irq_desc *desc;
+ unsigned int irr;
- irq = __this_cpu_read(vector_irq[vector]);
-
- if (irq <= VECTOR_UNDEFINED)
+ retry:
+ desc = __this_cpu_read(vector_irq[vector]);
+ if (IS_ERR_OR_NULL(desc))
continue;
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
+ if (!raw_spin_trylock(&desc->lock)) {
+ raw_spin_unlock(&vector_lock);
+ cpu_relax();
+ raw_spin_lock(&vector_lock);
+ goto retry;
+ }
- data = apic_chip_data(&desc->irq_data);
+ data = apic_chip_data(irq_desc_get_irq_data(desc));
if (!data)
- continue;
-
- raw_spin_lock(&desc->lock);
+ goto unlock;
/*
* Check if the irq migration is in progress. If so, we
@@ -589,11 +586,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
unlock:
raw_spin_unlock(&desc->lock);
}
+ raw_spin_unlock(&vector_lock);
+
exiting_irq();
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index ab3219b3fbda..cc8311c4d298 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
return notifier_from_errno(err);
}
-static struct notifier_block __refdata x2apic_cpu_notifier = {
+static struct notifier_block x2apic_cpu_notifier = {
.notifier_call = update_clusterinfo,
};
@@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 3ffd925655e0..662e9150ea6f 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -128,7 +128,6 @@ static struct apic apic_x2apic_phys = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c8d92950bc04..4a139465f1d4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
APIC_DM_STARTUP;
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
- atomic_set(&init_deasserted, 1);
return 0;
}
@@ -414,7 +413,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
.send_IPI_self = uv_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .wait_for_init_deassert = false,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 927ec9235947..052c9c3026cc 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -919,7 +919,7 @@ recalc:
} else if (jiffies_since_last_check > idle_period) {
unsigned int idle_percentage;
- idle_percentage = stime - last_stime;
+ idle_percentage = cputime_to_jiffies(stime - last_stime);
idle_percentage *= 100;
idle_percentage /= jiffies_since_last_check;
use_apm_idle = (idle_percentage > idle_threshold);
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 58118e207a69..145863d4d343 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
@@ -163,6 +163,5 @@ static int start_periodic_check_for_corruption(void)
schedule_delayed_work(&bios_check_work, 0);
return 0;
}
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 9bff68798836..4eb065c6bed2 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
perf_event_intel_uncore_snbep.o \
perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD) += perf_event_msr.o
endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd3a4baffe50..4a70fc6d400a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
#include <asm/cpu.h>
#include <asm/smp.h>
#include <asm/pci-direct.h>
+#include <asm/delay.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
- unsigned long d, d2;
+ u64 d, d2;
printk(KERN_INFO "AMD K6 stepping B detected - ");
@@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
n = K6_BUG_LOOP;
f_vide = vide;
- rdtscl(d);
+ d = rdtsc();
while (n--)
f_vide();
- rdtscl(d2);
+ d2 = rdtsc();
d = d2-d;
if (d > 20*K6_BUG_LOOP)
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
/* A random value per boot for bit slice [12:upper_bit) */
va_align.bits = get_random_int() & va_align.mask;
}
+
+ if (cpu_has(c, X86_FEATURE_MWAITX))
+ use_mwaitx_delay();
}
static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 922c5e0cea4c..de22ea7ff82f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
#include <linux/kgdb.h>
#include <linux/smp.h>
#include <linux/io.h>
+#include <linux/syscore_ops.h>
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
@@ -1109,10 +1110,10 @@ void print_cpu_info(struct cpuinfo_x86 *c)
else
printk(KERN_CONT "%d86", c->x86);
- printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
+ printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model);
if (c->x86_mask || c->cpuid_level >= 0)
- printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);
+ printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask);
else
printk(KERN_CONT ")\n");
@@ -1185,10 +1186,10 @@ void syscall_init(void)
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
+ wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1200,7 @@ void syscall_init(void)
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
@@ -1410,7 +1411,7 @@ void cpu_init(void)
load_sp0(t, &current->thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
clear_all_debug_regs();
dbg_restore_debug_regs();
@@ -1459,7 +1460,7 @@ void cpu_init(void)
load_sp0(t, thread);
set_tss_desc(cpu, t);
load_TR_desc();
- load_LDT(&init_mm.context);
+ load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
@@ -1488,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)
return boot_cpu_has(bit);
}
EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
+
+static void bsp_resume(void)
+{
+ if (this_cpu->c_bsp_resume)
+ this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+ .resume = bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+ register_syscore_ops(&cpu_syscore_ops);
+ return 0;
+}
+core_initcall(init_cpu_syscore);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index c37dc37e8317..2584265d4745 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -13,6 +13,7 @@ struct cpu_dev {
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
+ void (*c_bsp_resume)(struct cpuinfo_x86 *);
int c_x86_vendor;
#ifdef CONFIG_X86_32
/* Optional vendor specific routine to obtain the cache size. */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 50163fa9034f..98a13db5f4be 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
}
+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+ u64 epb;
+
+ /*
+ * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+ * (x86_energy_perf_policy(8) is available to change it at run-time.)
+ */
+ if (!cpu_has(c, X86_FEATURE_EPB))
+ return;
+
+ rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+ if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+ return;
+
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+ epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+ wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+ /*
+ * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+ * so reinitialize it properly like during bootup:
+ */
+ init_intel_energy_perf(c);
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_VMX))
detect_vmx_virtcap(c);
- /*
- * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
- * x86_energy_perf_policy(8) is available to change it at run-time
- */
- if (cpu_has(c, X86_FEATURE_EPB)) {
- u64 epb;
-
- rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
- pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
- pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
- epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
- wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
- }
- }
+ init_intel_energy_perf(c);
}
#ifdef CONFIG_X86_32
@@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = {
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
+ .c_bsp_resume = intel_bsp_resume,
.c_x86_vendor = X86_VENDOR_INTEL,
};
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
index 1c338b0eba05..336878a5d205 100644
--- a/arch/x86/kernel/cpu/intel_pt.h
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -25,32 +25,11 @@
*/
#define TOPA_PMI_MARGIN 512
-/*
- * Table of Physical Addresses bits
- */
-enum topa_sz {
- TOPA_4K = 0,
- TOPA_8K,
- TOPA_16K,
- TOPA_32K,
- TOPA_64K,
- TOPA_128K,
- TOPA_256K,
- TOPA_512K,
- TOPA_1MB,
- TOPA_2MB,
- TOPA_4MB,
- TOPA_8MB,
- TOPA_16MB,
- TOPA_32MB,
- TOPA_64MB,
- TOPA_128MB,
- TOPA_SZ_END,
-};
+#define TOPA_SHIFT 12
-static inline unsigned int sizes(enum topa_sz tsz)
+static inline unsigned int sizes(unsigned int tsz)
{
- return 1 << (tsz + 12);
+ return 1 << (tsz + TOPA_SHIFT);
};
struct topa_entry {
@@ -66,20 +45,26 @@ struct topa_entry {
u64 rsvd4 : 16;
};
-#define TOPA_SHIFT 12
-#define PT_CPUID_LEAVES 2
+#define PT_CPUID_LEAVES 2
+#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
enum pt_capabilities {
PT_CAP_max_subleaf = 0,
PT_CAP_cr3_filtering,
+ PT_CAP_psb_cyc,
+ PT_CAP_mtc,
PT_CAP_topa_output,
PT_CAP_topa_multiple_entries,
+ PT_CAP_single_range_output,
PT_CAP_payloads_lip,
+ PT_CAP_mtc_periods,
+ PT_CAP_cycle_thresholds,
+ PT_CAP_psb_periods,
};
struct pt_pmu {
struct pmu pmu;
- u32 caps[4 * PT_CPUID_LEAVES];
+ u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
};
/**
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index bb34b03af252..a3311c886194 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,4 +1,4 @@
-obj-y = mce.o mce-severity.o
+obj-y = mce.o mce-severity.o mce-genpool.o
obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o
obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index a1aef9533154..34c89a3e8260 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
m.addr = mem_err->physical_addr;
mce_log(&m);
- mce_notify_irq();
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
new file mode 100644
index 000000000000..0a850100c594
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,99 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ (2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+void mce_gen_pool_process(void)
+{
+ struct llist_node *head;
+ struct mce_evt_llist *node;
+ struct mce *mce;
+
+ head = llist_del_all(&mce_event_llist);
+ if (!head)
+ return;
+
+ head = llist_reverse_order(head);
+ llist_for_each_entry(node, head, llnode) {
+ mce = &node->mce;
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+ }
+}
+
+bool mce_gen_pool_empty(void)
+{
+ return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+ struct mce_evt_llist *node;
+
+ if (!mce_evt_pool)
+ return -EINVAL;
+
+ node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+ if (!node) {
+ pr_warn_ratelimited("MCE records pool full!\n");
+ return -ENOMEM;
+ }
+
+ memcpy(&node->mce, mce, sizeof(*mce));
+ llist_add(&node->llnode, &mce_event_llist);
+
+ return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+ struct gen_pool *tmpp;
+ int ret = -ENOMEM;
+
+ tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+ if (!tmpp)
+ goto out;
+
+ ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+ if (ret) {
+ gen_pool_destroy(tmpp);
+ goto out;
+ }
+
+ mce_evt_pool = tmpp;
+
+out:
+ return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+ /* Just init mce_gen_pool once. */
+ if (mce_evt_pool)
+ return 0;
+
+ return mce_gen_pool_create();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fe32074b865b..547720efd923 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,6 +13,8 @@ enum severity_level {
MCE_PANIC_SEVERITY,
};
+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
#define ATTR_LEN 16
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
@@ -24,6 +26,16 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
+struct mce_evt_llist {
+ struct llist_node llnode;
+ struct mce mce;
+};
+
+void mce_gen_pool_process(void);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
@@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)
return -EINVAL;
}
#endif
+
+void mce_inject_log(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff103c3..9d014b82a124 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,11 +52,11 @@
static DEFINE_MUTEX(mce_chrdev_read_mutex);
-#define rcu_dereference_check_mce(p) \
+#define mce_log_get_idx_check(p) \
({ \
- rcu_lockdep_assert(rcu_read_lock_sched_held() || \
- lockdep_is_held(&mce_chrdev_read_mutex), \
- "suspicious rcu_dereference_check_mce() usage"); \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+ !lockdep_is_held(&mce_chrdev_read_mutex), \
+ "suspicious mce_log_get_idx_check() usage"); \
smp_load_acquire(&(p)); \
})
@@ -110,22 +110,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
*/
mce_banks_t mce_banks_ce_disabled;
-static DEFINE_PER_CPU(struct work_struct, mce_work);
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;
static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+static int mce_usable_address(struct mce *m);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
- rdtscll(m->tsc);
+ m->tsc = rdtsc();
/* We hope get_seconds stays lockless */
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -157,12 +159,13 @@ void mce_log(struct mce *mce)
/* Emit the trace record: */
trace_mce_record(mce);
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+ if (!mce_gen_pool_add(mce))
+ irq_work_queue(&mce_irq_work);
mce->finished = 0;
wmb();
for (;;) {
- entry = rcu_dereference_check_mce(mcelog.next);
+ entry = mce_log_get_idx_check(mcelog.next);
for (;;) {
/*
@@ -196,48 +199,23 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
-static void drain_mcelog_buffer(void)
+void mce_inject_log(struct mce *m)
{
- unsigned int next, i, prev = 0;
-
- next = ACCESS_ONCE(mcelog.next);
-
- do {
- struct mce *m;
-
- /* drain what was logged during boot */
- for (i = prev; i < next; i++) {
- unsigned long start = jiffies;
- unsigned retries = 1;
-
- m = &mcelog.entry[i];
-
- while (!m->finished) {
- if (time_after_eq(jiffies, start + 2*retries))
- retries++;
-
- cpu_relax();
-
- if (!m->finished && retries >= 4) {
- pr_err("skipping error being logged currently!\n");
- break;
- }
- }
- smp_rmb();
- atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
- }
-
- memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
- prev = next;
- next = cmpxchg(&mcelog.next, prev, 0);
- } while (next != prev);
+ mutex_lock(&mce_chrdev_read_mutex);
+ mce_log(m);
+ mutex_unlock(&mce_chrdev_read_mutex);
}
+EXPORT_SYMBOL_GPL(mce_inject_log);
+static struct notifier_block mce_srao_nb;
void mce_register_decode_chain(struct notifier_block *nb)
{
+ /* Ensure SRAO notifier has the highest priority in the decode chain. */
+ if (nb != &mce_srao_nb && nb->priority == INT_MAX)
+ nb->priority -= 1;
+
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
- drain_mcelog_buffer();
}
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
@@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
}
}
-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16 /* we use one entry less */
-
-struct mce_ring {
- unsigned short start;
- unsigned short end;
- unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
- struct mce_ring *r = this_cpu_ptr(&mce_ring);
-
- return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
- struct mce_ring *r;
- int ret = 0;
-
- *pfn = 0;
- get_cpu();
- r = this_cpu_ptr(&mce_ring);
- if (r->start == r->end)
- goto out;
- *pfn = r->ring[r->start];
- r->start = (r->start + 1) % MCE_RING_SIZE;
- ret = 1;
-out:
- put_cpu();
- return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
- struct mce_ring *r = this_cpu_ptr(&mce_ring);
- unsigned next;
-
- next = (r->end + 1) % MCE_RING_SIZE;
- if (next == r->start)
- return -1;
- r->ring[r->end] = pfn;
- wmb();
- r->end = next;
- return 0;
-}
-
int mce_available(struct cpuinfo_x86 *c)
{
if (mca_cfg.disabled)
@@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
- if (!mce_ring_empty())
- schedule_work(this_cpu_ptr(&mce_work));
+ if (!mce_gen_pool_empty() && keventd_up())
+ schedule_work(&mce_work);
}
-static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-
static void mce_irq_work_cb(struct irq_work *entry)
{
mce_notify_irq();
@@ -551,8 +472,29 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
- irq_work_queue(this_cpu_ptr(&mce_irq_work));
+ irq_work_queue(&mce_irq_work);
+}
+
+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *mce = (struct mce *)data;
+ unsigned long pfn;
+
+ if (!mce)
+ return NOTIFY_DONE;
+
+ if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+ pfn = mce->addr >> PAGE_SHIFT;
+ memory_failure(pfn, MCE_VECTOR, 0);
+ }
+
+ return NOTIFY_OK;
}
+static struct notifier_block mce_srao_nb = {
+ .notifier_call = srao_decode_notifier,
+ .priority = INT_MAX,
+};
/*
* Read ADDR and MISC registers.
@@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
if (m.status & MCI_STATUS_ADDRV) {
- mce_ring_add(m.addr >> PAGE_SHIFT);
- mce_schedule_work();
+ m.severity = severity;
+ m.usable_addr = mce_usable_address(&m);
+
+ if (!mce_gen_pool_add(&m))
+ mce_schedule_work();
}
}
@@ -1029,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
- enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
@@ -1055,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
int flags = MF_ACTION_REQUIRED;
int lmce = 0;
- prev_state = ist_enter(regs);
+ ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1143,15 +1087,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_read_aux(&m, i);
- /*
- * Action optional error. Queue address for later processing.
- * When the ring overflows we just ignore the AO error.
- * RED-PEN add some logging mechanism when
- * usable_address or mce_add_ring fails.
- * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
- */
- if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
- mce_ring_add(m.addr >> PAGE_SHIFT);
+ /* assuming valid severity level != 0 */
+ m.severity = severity;
+ m.usable_addr = mce_usable_address(&m);
mce_log(&m);
@@ -1227,7 +1165,7 @@ out:
local_irq_disable();
ist_end_non_atomic();
done:
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1247,14 +1185,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)
/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
- * placed into the "ring").
+ * placed into the genpool).
*/
static void mce_process_work(struct work_struct *dummy)
{
- unsigned long pfn;
-
- while (mce_ring_get(&pfn))
- memory_failure(pfn, MCE_VECTOR, 0);
+ mce_gen_pool_process();
}
#ifdef CONFIG_X86_MCE_INTEL
@@ -1678,6 +1613,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
}
}
+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ mce_intel_feature_clear(c);
+ break;
+ default:
+ break;
+ }
+}
+
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
{
unsigned long iv = check_interval * HZ;
@@ -1731,13 +1677,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
return;
}
+ if (mce_gen_pool_init()) {
+ mca_cfg.disabled = true;
+ pr_emerg("Couldn't allocate MCE records pool!\n");
+ return;
+ }
+
machine_check_vector = do_machine_check;
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
- INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
- init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+ if (mca_cfg.disabled)
+ return;
+
+ if (!mce_available(c))
+ return;
+
+ /*
+ * Possibly to clear general settings generic to x86
+ * __mcheck_cpu_clear_generic(c);
+ */
+ __mcheck_cpu_clear_vendor(c);
+
}
/*
@@ -1784,7 +1753,7 @@ static void collect_tscs(void *data)
{
unsigned long *cpu_tsc = (unsigned long *)data;
- rdtscll(cpu_tsc[smp_processor_id()]);
+ cpu_tsc[smp_processor_id()] = rdtsc();
}
static int mce_apei_read_done;
@@ -1850,7 +1819,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
goto out;
}
- next = rcu_dereference_check_mce(mcelog.next);
+ next = mce_log_get_idx_check(mcelog.next);
/* Only supports full reads right now */
err = -EINVAL;
@@ -2056,8 +2025,12 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mce_register_decode_chain(&mce_srao_nb);
mcheck_vendor_init_severity();
+ INIT_WORK(&mce_work, mce_process_work);
+ init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
return 0;
}
@@ -2591,5 +2564,20 @@ static int __init mcheck_debugfs_init(void)
return 0;
}
-late_initcall(mcheck_debugfs_init);
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
#endif
+
+static int __init mcheck_late_init(void)
+{
+ mcheck_debugfs_init();
+
+ /*
+ * Flush out everything that has been logged during early boot, now that
+ * everything has been initialized (workqueues, decoders, ...).
+ */
+ mce_schedule_work();
+
+ return 0;
+}
+late_initcall(mcheck_late_init);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 844f56c5616d..1e8bb6c94f14 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
+static void cmci_toggle_interrupt_mode(bool on)
+{
+ unsigned long flags, *owned;
+ int bank;
+ u64 val;
+
+ raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+ owned = this_cpu_ptr(mce_banks_owned);
+ for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+ rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+ if (on)
+ val |= MCI_CTL2_CMCI_EN;
+ else
+ val &= ~MCI_CTL2_CMCI_EN;
+
+ wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+ }
+ raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
- cmci_reenable();
+ cmci_toggle_interrupt_mode(true);
cmci_recheck();
}
return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
}
}
-static void cmci_storm_disable_banks(void)
-{
- unsigned long flags, *owned;
- int bank;
- u64 val;
-
- raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = this_cpu_ptr(mce_banks_owned);
- for_each_set_bit(bank, owned, MAX_NR_BANKS) {
- rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
- val &= ~MCI_CTL2_CMCI_EN;
- wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- }
- raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
static bool cmci_storm_detect(void)
{
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD)
return false;
- cmci_storm_disable_banks();
+ cmci_toggle_interrupt_mode(false);
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_STORM_INTERVAL);
@@ -246,7 +251,6 @@ static void intel_threshold_interrupt(void)
return;
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
- mce_notify_irq();
}
/*
@@ -435,7 +439,7 @@ static void intel_init_cmci(void)
cmci_recheck();
}
-void intel_init_lmce(void)
+static void intel_init_lmce(void)
{
u64 val;
@@ -448,9 +452,26 @@ void intel_init_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
}
+static void intel_clear_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+ val &= ~MCG_EXT_CTL_LMCE_EN;
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
}
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad4e61a..12402e10aeff 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
u32 loaddr, hi, lotype;
- prev_state = ist_enter(regs);
+ ist_enter(regs);
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
/* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f138296fbe..01dd8702880b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state = ist_enter(regs);
+ ist_enter(regs);
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
/* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 6236a54a63f4..9e3f3c7dd5d7 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
return err;
}
-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
int cpu = dev->id;
if (!cpu_online(cpu))
- return 0;
+ return;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&dev->kobj, &mc_attr_group);
- return 0;
}
static struct subsys_interface mc_cpu_interface = {
@@ -460,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
-static struct notifier_block __refdata mc_cpu_notifier = {
+static struct notifier_block mc_cpu_notifier = {
.notifier_call = mc_cpu_callback,
};
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 8187b7247d1c..37ea89c11520 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
}
#ifdef DEBUG
-static void __ref show_saved_mc(void)
+static void show_saved_mc(void)
{
int i, j;
unsigned int sig, pf, rev, total_size, data_size, date;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index aad4bd84b475..20e242ea1bc4 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
#include <linux/efi.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/kexec.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
#include <asm/hyperv.h>
@@ -28,12 +29,15 @@
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
+#include <asm/reboot.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
void hyperv_vector_handler(struct pt_regs *regs)
{
@@ -67,7 +71,47 @@ void hv_remove_vmbus_irq(void)
}
EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
-#endif
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+ hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+ hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+ hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+ hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
+
+#ifdef CONFIG_KEXEC_CORE
+static void hv_machine_shutdown(void)
+{
+ if (kexec_in_progress && hv_kexec_handler)
+ hv_kexec_handler();
+ native_machine_shutdown();
+}
+
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+ if (hv_crash_handler)
+ hv_crash_handler(regs);
+ native_machine_crash_shutdown(regs);
+}
+#endif /* CONFIG_KEXEC_CORE */
+#endif /* CONFIG_HYPERV */
static uint32_t __init ms_hyperv_platform(void)
{
@@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
@@ -141,6 +186,11 @@ static void __init ms_hyperv_init_platform(void)
no_timer_check = 1;
#endif
+#if IS_ENABLED(CONFIG_HYPERV) && defined(CONFIG_KEXEC_CORE)
+ machine_ops.shutdown = hv_machine_shutdown;
+ machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+#endif
+ mark_tsc_unstable("running on Hyper-V");
}
const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index e7ed0d8ebacb..f891b4750f04 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
increment);
}
-EXPORT_SYMBOL(mtrr_add);
/**
* mtrr_del_page - delete a memory type region
@@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}
-EXPORT_SYMBOL(mtrr_del);
/**
* arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de47900f..66dd3fe99b82 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs)
}
/* Merge two pointer arrays */
-static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
{
struct attribute **new;
int j, i;
@@ -2179,24 +2179,32 @@ static unsigned long get_segment_base(unsigned int segment)
int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ struct ldt_struct *ldt;
+
if (idx > LDT_ENTRIES)
return 0;
- if (idx > current->active_mm->context.size)
+ /* IRQs are off, so this synchronizes with smp_store_release */
+ ldt = lockless_dereference(current->active_mm->context.ldt);
+ if (!ldt || idx > ldt->size)
return 0;
- desc = current->active_mm->context.ldt;
+ desc = &ldt->entries[idx];
+#else
+ return 0;
+#endif
} else {
if (idx > GDT_ENTRIES)
return 0;
- desc = raw_cpu_ptr(gdt_page.gdt);
+ desc = raw_cpu_ptr(gdt_page.gdt) + idx;
}
- return get_desc_base(desc + idx);
+ return get_desc_base(desc);
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
#include <asm/compat.h>
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 3e7fd27dfe20..165be83a7fa4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -47,6 +47,7 @@ enum extra_reg_type {
EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
EXTRA_REG_LBR = 2, /* lbr_select */
EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */
+ EXTRA_REG_FE = 4, /* fe_* */
EXTRA_REG_MAX /* number of entries needed */
};
@@ -165,7 +166,7 @@ struct intel_excl_cntrs {
unsigned core_id; /* per-core: core id */
};
-#define MAX_LBR_ENTRIES 16
+#define MAX_LBR_ENTRIES 32
enum {
X86_PERF_KFREE_SHARED = 0,
@@ -594,6 +595,7 @@ struct x86_pmu {
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
int max_pebs_events;
+ unsigned long free_running_flags;
/*
* Intel LBR
@@ -624,6 +626,7 @@ struct x86_pmu {
struct x86_perf_task_context {
u64 lbr_from[MAX_LBR_ENTRIES];
u64 lbr_to[MAX_LBR_ENTRIES];
+ u64 lbr_info[MAX_LBR_ENTRIES];
int lbr_callstack_users;
int lbr_stack_state;
};
@@ -793,6 +796,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);
+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
#ifdef CONFIG_CPU_SUP_AMD
int amd_pmu_init(void);
@@ -808,20 +813,6 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
static inline bool intel_pmu_has_bts(struct perf_event *event)
{
if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
@@ -873,6 +864,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];
extern struct event_constraint intel_hsw_pebs_event_constraints[];
+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event);
void intel_pmu_pebs_enable(struct perf_event *event);
@@ -911,6 +904,8 @@ void intel_pmu_lbr_init_snb(void);
void intel_pmu_lbr_init_hsw(void);
+void intel_pmu_lbr_init_skl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
@@ -934,6 +929,7 @@ static inline int is_ht_workaround_enabled(void)
{
return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
}
+
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b9826a981fb2..f63360be2238 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,7 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <linux/watchdog.h>
+#include <linux/nmi.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ EVENT_CONSTRAINT_END
+};
+
static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,18 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+ /*
+ * Note the low 8 bits eventsel code is not a continuous field, containing
+ * some #GPing bits. These are masked out.
+ */
+ INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE),
+ EVENT_EXTRA_END
+};
+
EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3");
EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2");
@@ -235,7 +255,7 @@ struct event_constraint intel_bdw_event_constraints[] = {
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
+ INTEL_UEVENT_CONSTRAINT(0x8a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_MISS */
EVENT_CONSTRAINT_END
};
@@ -244,6 +264,200 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD BIT_ULL(0)
+#define SKL_DEMAND_RFO BIT_ULL(1)
+#define SKL_ANY_RESPONSE BIT_ULL(16)
+#define SKL_SUPPLIER_NONE BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM BIT_ULL(29)
+#define SKL_L3_MISS (SKL_L3_MISS_LOCAL_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT BIT_ULL(30)
+#define SKL_SNOOP_NONE BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define SKL_SNOOP_MISS BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define SKL_SNOOP_HITM BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM BIT_ULL(37)
+#define SKL_ANY_SNOOP (SKL_SPL_HIT|SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM (SKL_SNOOP_NONE| \
+ SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+ SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+ SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE (SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+ SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x283, /* ICACHE_64B.MISS */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x2085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0xe85, /* ITLB_MISSES.WALK_COMPLETED */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS|SKL_ANY_SNOOP|
+ SKL_SUPPLIER_NONE,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_READ|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = SKL_DEMAND_WRITE|
+ SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
#define SNB_DMND_DATA_RD (1ULL << 0)
#define SNB_DMND_RFO (1ULL << 1)
#define SNB_DMND_IFETCH (1ULL << 2)
@@ -1114,7 +1328,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =
{
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
- INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
EVENT_EXTRA_END
};
@@ -1594,6 +1808,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
loops = 0;
again:
+ intel_pmu_lbr_read();
intel_pmu_ack_status(status);
if (++loops > 100) {
static bool warned = false;
@@ -1608,16 +1823,16 @@ again:
inc_irq_stat(apic_perf_irqs);
- intel_pmu_lbr_read();
/*
- * CondChgd bit 63 doesn't mean any overflow status. Ignore
- * and clear the bit.
+ * Ignore a range of extra bits in status that do not indicate
+ * overflow by themselves.
*/
- if (__test_and_clear_bit(63, (unsigned long *)&status)) {
- if (!status)
- goto done;
- }
+ status &= ~(GLOBAL_STATUS_COND_CHG |
+ GLOBAL_STATUS_ASIF |
+ GLOBAL_STATUS_LBRS_FROZEN);
+ if (!status)
+ goto done;
/*
* PEBS overflow sets bit 62 in the global status register
@@ -1699,18 +1914,22 @@ intel_bts_constraints(struct perf_event *event)
return NULL;
}
-static int intel_alt_er(int idx)
+static int intel_alt_er(int idx, u64 config)
{
+ int alt_idx;
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
- return EXTRA_REG_RSP_1;
+ alt_idx = EXTRA_REG_RSP_1;
if (idx == EXTRA_REG_RSP_1)
- return EXTRA_REG_RSP_0;
+ alt_idx = EXTRA_REG_RSP_0;
+
+ if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+ return idx;
- return idx;
+ return alt_idx;
}
static void intel_fixup_er(struct perf_event *event, int idx)
@@ -1799,7 +2018,7 @@ again:
*/
c = NULL;
} else {
- idx = intel_alt_er(idx);
+ idx = intel_alt_er(idx, reg->config);
if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -2102,9 +2321,12 @@ static struct event_constraint *
intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
- struct event_constraint *c1 = cpuc->event_constraint[idx];
+ struct event_constraint *c1 = NULL;
struct event_constraint *c2;
+ if (idx >= 0) /* fake does < 0 */
+ c1 = cpuc->event_constraint[idx];
+
/*
* first time only
* - static constraint: no change across incremental scheduling calls
@@ -2253,6 +2475,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
}
}
+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+ unsigned long flags = x86_pmu.free_running_flags;
+
+ if (event->attr.use_clockid)
+ flags &= ~PERF_SAMPLE_TIME;
+ return flags;
+}
+
static int intel_pmu_hw_config(struct perf_event *event)
{
int ret = x86_pmu_hw_config(event);
@@ -2263,7 +2494,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip) {
if (!event->attr.freq) {
event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
- if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+ if (!(event->attr.sample_type &
+ ~intel_pmu_free_running_flags(event)))
event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
}
if (x86_pmu.pebs_aliases)
@@ -2534,7 +2766,7 @@ static int intel_pmu_cpu_prepare(int cpu)
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ goto err;
}
if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
@@ -2542,18 +2774,27 @@ static int intel_pmu_cpu_prepare(int cpu)
cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
if (!cpuc->constraint_list)
- return NOTIFY_BAD;
+ goto err_shared_regs;
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
- if (!cpuc->excl_cntrs) {
- kfree(cpuc->constraint_list);
- kfree(cpuc->shared_regs);
- return NOTIFY_BAD;
- }
+ if (!cpuc->excl_cntrs)
+ goto err_constraint_list;
+
cpuc->excl_thread_id = 0;
}
return NOTIFY_OK;
+
+err_constraint_list:
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+
+err_shared_regs:
+ kfree(cpuc->shared_regs);
+ cpuc->shared_regs = NULL;
+
+err:
+ return NOTIFY_BAD;
}
static void intel_pmu_cpu_starting(int cpu)
@@ -2655,6 +2896,8 @@ PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
+PMU_FORMAT_ATTR(frontend, "config1:0-23");
+
static struct attribute *intel_arch3_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
@@ -2671,6 +2914,11 @@ static struct attribute *intel_arch3_formats_attr[] = {
NULL,
};
+static struct attribute *skl_format_attr[] = {
+ &format_attr_frontend.attr,
+ NULL,
+};
+
static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
@@ -2685,6 +2933,8 @@ static __initconst const struct x86_pmu core_pmu = {
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
+ .free_running_flags = PEBS_FREERUNNING_FLAGS,
+
/*
* Intel PMCs cannot be accessed sanely above 32-bit width,
* so we install an artificial 1<<31 period regardless of
@@ -2723,6 +2973,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.event_map = intel_pmu_event_map,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
+ .free_running_flags = PEBS_FREERUNNING_FLAGS,
/*
* Intel PMCs cannot be accessed sanely above 32 bit width,
* so we install an artificial 1<<31 period regardless of
@@ -3260,6 +3511,30 @@ __init int intel_pmu_init(void)
pr_cont("Broadwell events, ");
break;
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ intel_pmu_lbr_init_skl();
+
+ x86_pmu.event_constraints = intel_skl_event_constraints;
+ x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_skl_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.format_attrs = merge_attr(intel_arch3_formats_attr,
+ skl_format_attr);
+ WARN_ON(!x86_pmu.format_attrs);
+ x86_pmu.cpu_events = hsw_events_attrs;
+ pr_cont("Skylake events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -3329,7 +3604,7 @@ __init int intel_pmu_init(void)
*/
if (x86_pmu.extra_regs) {
for (er = x86_pmu.extra_regs; er->msr; er++) {
- er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
/* Disable LBR select mapping */
if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
x86_pmu.lbr_sel_map = NULL;
@@ -3368,7 +3643,10 @@ static __init int fixup_ht_bug(void)
return 0;
}
- watchdog_nmi_disable_all();
+ if (lockup_detector_suspend() != 0) {
+ pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n");
+ return 0;
+ }
x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
@@ -3376,7 +3654,7 @@ static __init int fixup_ht_bug(void)
x86_pmu.commit_scheduling = NULL;
x86_pmu.stop_scheduling = NULL;
- watchdog_nmi_enable_all();
+ lockup_detector_resume();
get_online_cpus();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index 43dd672d788b..d1c0f254afbe 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -62,9 +62,6 @@ struct bts_buffer {
struct pmu bts_pmu;
-void intel_pmu_enable_bts(u64 config);
-void intel_pmu_disable_bts(void);
-
static size_t buf_size(struct page *page)
{
return 1 << (PAGE_SHIFT + page_private(page));
@@ -225,6 +222,7 @@ static void __bts_event_start(struct perf_event *event)
if (!buf || bts_buffer_is_full(buf, bts))
return;
+ event->hw.itrace_started = 1;
event->hw.state = 0;
if (!buf->snapshot)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 188076161c1b..377e8f8ed391 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -952,6 +952,14 @@ static u64 intel_cqm_event_count(struct perf_event *event)
return 0;
/*
+ * Getting up-to-date values requires an SMP IPI which is not
+ * possible if we're being called in interrupt context. Return
+ * the cached values instead.
+ */
+ if (unlikely(in_interrupt()))
+ goto out;
+
+ /*
* Notice that we don't perform the reading of an RMID
* atomically, because we can't hold a spin lock across the
* IPIs.
@@ -1247,7 +1255,7 @@ static inline void cqm_pick_event_reader(int cpu)
cpumask_set_cpu(cpu, &cqm_cpumask);
}
-static void intel_cqm_cpu_prepare(unsigned int cpu)
+static void intel_cqm_cpu_starting(unsigned int cpu)
{
struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -1288,13 +1296,11 @@ static int intel_cqm_cpu_notifier(struct notifier_block *nb,
unsigned int cpu = (unsigned long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_UP_PREPARE:
- intel_cqm_cpu_prepare(cpu);
- break;
case CPU_DOWN_PREPARE:
intel_cqm_cpu_exit(cpu);
break;
case CPU_STARTING:
+ intel_cqm_cpu_starting(cpu);
cqm_pick_event_reader(cpu);
break;
}
@@ -1365,7 +1371,7 @@ static int __init intel_cqm_init(void)
goto out;
for_each_online_cpu(i) {
- intel_cqm_cpu_prepare(i);
+ intel_cqm_cpu_starting(i);
cqm_pick_event_reader(i);
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 71fc40238843..84f236ab96b0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {
#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL
+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+ u64 real_ip, tsx_tuning;
+ u64 tsc;
+};
+
void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -675,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_skl_pebs_event_constraints[] = {
+ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2), /* INST_RETIRED.PREC_DIST */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_PLD_CONSTRAINT(0x1cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
struct event_constraint *c;
@@ -754,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
struct debug_store *ds = cpuc->ds;
+ bool large_pebs = ds->pebs_interrupt_threshold >
+ ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+ if (large_pebs)
+ intel_pmu_drain_pebs_buffer();
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -762,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)
else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
cpuc->pebs_enabled &= ~(1ULL << 63);
- if (ds->pebs_interrupt_threshold >
- ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
- intel_pmu_drain_pebs_buffer();
- if (!pebs_is_enabled(cpuc))
- perf_sched_cb_dec(event->ctx->pmu);
- }
+ if (large_pebs && !pebs_is_enabled(cpuc))
+ perf_sched_cb_dec(event->ctx->pmu);
if (cpuc->enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -885,7 +921,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
return 0;
}
-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
{
if (pebs->tsx_tuning) {
union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -894,7 +930,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
return 0;
}
-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
{
u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;
@@ -918,7 +954,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
* unconditionally access the 'extra' entries.
*/
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
- struct pebs_record_hsw *pebs = __pebs;
+ struct pebs_record_skl *pebs = __pebs;
u64 sample_type;
int fll, fst, dsrc;
int fl = event->hw.flags;
@@ -1016,6 +1052,16 @@ static void setup_pebs_sample_data(struct perf_event *event,
data->txn = intel_hsw_transaction(pebs);
}
+ /*
+ * v3 supplies an accurate time stamp, so we use that
+ * for the time stamp.
+ *
+ * We can only do this for the default trace clock.
+ */
+ if (x86_pmu.intel_cap.pebs_format >= 3 &&
+ event->attr.use_clockid == 0)
+ data->time = native_sched_clock_from_tsc(pebs->tsc);
+
if (has_branch_stack(event))
data->br_stack = &cpuc->lbr_stack;
}
@@ -1142,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
for (at = base; at < top; at += x86_pmu.pebs_record_size) {
struct pebs_record_nhm *p = at;
+ u64 pebs_status;
/* PEBS v3 has accurate status bits */
if (x86_pmu.intel_cap.pebs_format >= 3) {
@@ -1152,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
continue;
}
- bit = find_first_bit((unsigned long *)&p->status,
+ pebs_status = p->status & cpuc->pebs_enabled;
+ pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+ bit = find_first_bit((unsigned long *)&pebs_status,
x86_pmu.max_pebs_events);
- if (bit >= x86_pmu.max_pebs_events)
- continue;
- if (!test_bit(bit, cpuc->active_mask))
+ if (WARN(bit >= x86_pmu.max_pebs_events,
+ "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
+ (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
+ *(unsigned long long *)cpuc->active_mask))
continue;
+
/*
* The PEBS hardware does not deal well with the situation
* when events happen near to each other and multiple bits
@@ -1172,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
* one, and it's not possible to reconstruct all events
* that caused the PEBS record. It's called collision.
* If collision happened, the record will be dropped.
- *
*/
- if (p->status != (1 << bit)) {
- u64 pebs_status;
-
- /* slow path */
- pebs_status = p->status & cpuc->pebs_enabled;
- pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
- if (pebs_status != (1 << bit)) {
- for_each_set_bit(i, (unsigned long *)&pebs_status,
- MAX_PEBS_EVENTS)
- error[i]++;
- continue;
- }
+ if (p->status != (1ULL << bit)) {
+ for_each_set_bit(i, (unsigned long *)&pebs_status,
+ x86_pmu.max_pebs_events)
+ error[i]++;
+ continue;
}
+
counts[bit]++;
}
for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
if ((counts[bit] == 0) && (error[bit] == 0))
continue;
+
event = cpuc->events[bit];
WARN_ON_ONCE(!event);
WARN_ON_ONCE(!event->attr.precise_ip);
@@ -1245,6 +1291,14 @@ void __init intel_ds_init(void)
x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
break;
+ case 3:
+ pr_cont("PEBS fmt3%c, ", pebs_type);
+ x86_pmu.pebs_record_size =
+ sizeof(struct pebs_record_skl);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+ break;
+
default:
printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
x86_pmu.pebs = 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 452a7bd2dedb..b2c9475b7ff2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
LBR_FORMAT_EIP = 0x02,
LBR_FORMAT_EIP_FLAGS = 0x03,
LBR_FORMAT_EIP_FLAGS2 = 0x04,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2,
+ LBR_FORMAT_INFO = 0x05,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO,
};
static enum {
@@ -140,6 +141,13 @@ static void __intel_pmu_lbr_enable(bool pmi)
u64 debugctl, lbr_select = 0, orig_debugctl;
/*
+ * No need to unfreeze manually, as v4 can do that as part
+ * of the GLOBAL_STATUS ack.
+ */
+ if (pmi && x86_pmu.version >= 4)
+ return;
+
+ /*
* No need to reprogram LBR_SELECT in a PMI, as it
* did not change.
*/
@@ -186,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + i, 0);
}
}
@@ -230,10 +240,12 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
mask = x86_pmu.lbr_nr - 1;
tos = intel_pmu_lbr_tos();
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_NONE;
}
@@ -251,10 +263,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
mask = x86_pmu.lbr_nr - 1;
tos = intel_pmu_lbr_tos();
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ for (i = 0; i < tos; i++) {
lbr_idx = (tos - i) & mask;
rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
}
task_ctx->lbr_stack_state = LBR_VALID;
}
@@ -411,16 +425,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
+ int num = x86_pmu.lbr_nr;
- for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+ num = tos;
+
+ for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
int skip = 0;
+ u16 cycles = 0;
int lbr_flags = lbr_desc[lbr_format];
rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+ if (lbr_format == LBR_FORMAT_INFO) {
+ u64 info;
+
+ rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ cycles = (info & LBR_INFO_CYCLES);
+ }
if (lbr_flags & LBR_EIP_FLAGS) {
mis = !!(from & LBR_FROM_FLAG_MISPRED);
pred = !mis;
@@ -450,6 +479,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_entries[out].predicted = pred;
cpuc->lbr_entries[out].in_tx = in_tx;
cpuc->lbr_entries[out].abort = abort;
+ cpuc->lbr_entries[out].cycles = cycles;
cpuc->lbr_entries[out].reserved = 0;
out++;
}
@@ -947,6 +977,26 @@ void intel_pmu_lbr_init_hsw(void)
pr_cont("16-deep LBR, ");
}
+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+ x86_pmu.lbr_nr = 32;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ /*
+ * SW branch filter usage:
+ * - support syscall, sysret capture.
+ * That requires LBR_FAR but that means far
+ * jmp need to be filtered out
+ */
+ pr_cont("32-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index 183de719628d..42169283448b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -65,15 +65,21 @@ static struct pt_cap_desc {
} pt_caps[] = {
PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
+ PT_CAP(mtc, 0, CR_EBX, BIT(3)),
PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+ PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000),
};
static u32 pt_cap_get(enum pt_capabilities cap)
{
struct pt_cap_desc *cd = &pt_caps[cap];
- u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
unsigned int shift = __ffs(cd->mask);
return (c & cd->mask) >> shift;
@@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = {
.name = "caps",
};
+PMU_FORMAT_ATTR(cyc, "config:1" );
+PMU_FORMAT_ATTR(mtc, "config:9" );
PMU_FORMAT_ATTR(tsc, "config:10" );
PMU_FORMAT_ATTR(noretcomp, "config:11" );
+PMU_FORMAT_ATTR(mtc_period, "config:14-17" );
+PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" );
+PMU_FORMAT_ATTR(psb_period, "config:24-27" );
static struct attribute *pt_formats_attr[] = {
+ &format_attr_cyc.attr,
+ &format_attr_mtc.attr,
&format_attr_tsc.attr,
&format_attr_noretcomp.attr,
+ &format_attr_mtc_period.attr,
+ &format_attr_cyc_thresh.attr,
+ &format_attr_psb_period.attr,
NULL,
};
@@ -129,10 +145,10 @@ static int __init pt_pmu_hw_init(void)
for (i = 0; i < PT_CPUID_LEAVES; i++) {
cpuid_count(20, i,
- &pt_pmu.caps[CR_EAX + i*4],
- &pt_pmu.caps[CR_EBX + i*4],
- &pt_pmu.caps[CR_ECX + i*4],
- &pt_pmu.caps[CR_EDX + i*4]);
+ &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
}
ret = -ENOMEM;
@@ -170,15 +186,65 @@ fail:
return ret;
}
-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC | \
+ RTIT_CTL_CYC_THRESH | \
+ RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \
+ RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \
+ RTIT_CTL_DISRETC | \
+ RTIT_CTL_CYC_PSB | \
+ RTIT_CTL_MTC)
static bool pt_event_valid(struct perf_event *event)
{
u64 config = event->attr.config;
+ u64 allowed, requested;
if ((config & PT_CONFIG_MASK) != config)
return false;
+ if (config & RTIT_CTL_CYC_PSB) {
+ if (!pt_cap_get(PT_CAP_psb_cyc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_psb_periods);
+ requested = (config & RTIT_CTL_PSB_FREQ) >>
+ RTIT_CTL_PSB_FREQ_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+ requested = (config & RTIT_CTL_CYC_THRESH) >>
+ RTIT_CTL_CYC_THRESH_OFFSET;
+ if (requested && (!(allowed & BIT(requested))))
+ return false;
+ }
+
+ if (config & RTIT_CTL_MTC) {
+ /*
+ * In the unlikely case that CPUID lists valid mtc periods,
+ * but not the mtc capability, drop out here.
+ *
+ * Spec says that setting mtc period bits while mtc bit in
+ * CPUID is 0 will #GP, so better safe than sorry.
+ */
+ if (!pt_cap_get(PT_CAP_mtc))
+ return false;
+
+ allowed = pt_cap_get(PT_CAP_mtc_periods);
+ if (!allowed)
+ return false;
+
+ requested = (config & RTIT_CTL_MTC_RANGE) >>
+ RTIT_CTL_MTC_RANGE_OFFSET;
+
+ if (!(allowed & BIT(requested)))
+ return false;
+ }
+
return true;
}
@@ -191,6 +257,11 @@ static void pt_config(struct perf_event *event)
{
u64 reg;
+ if (!event->hw.itrace_started) {
+ event->hw.itrace_started = 1;
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ }
+
reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
if (!event->attr.exclude_kernel)
@@ -910,7 +981,6 @@ void intel_pt_interrupt(void)
pt_config_buffer(buf->cur->table, buf->cur_idx,
buf->output_off);
- wrmsrl(MSR_IA32_RTIT_STATUS, 0);
pt_config(event);
}
}
@@ -934,7 +1004,6 @@ static void pt_event_start(struct perf_event *event, int mode)
pt_config_buffer(buf->cur->table, buf->cur_idx,
buf->output_off);
- wrmsrl(MSR_IA32_RTIT_STATUS, 0);
pt_config(event);
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 5cbd4e64feb5..81431c0f0614 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
1<<RAPL_IDX_RAM_NRG_STAT|\
1<<RAPL_IDX_PP1_NRG_STAT)
+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL (1<<RAPL_IDX_PKG_NRG_STAT|\
+ 1<<RAPL_IDX_RAM_NRG_STAT)
+
/*
* event code: LSB 8 bits, passed in attr->config
* any other bit is reserved
@@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = {
NULL,
};
+static struct attribute *rapl_events_knl_attr[] = {
+ EVENT_PTR(rapl_pkg),
+ EVENT_PTR(rapl_ram),
+
+ EVENT_PTR(rapl_pkg_unit),
+ EVENT_PTR(rapl_ram_unit),
+
+ EVENT_PTR(rapl_pkg_scale),
+ EVENT_PTR(rapl_ram_scale),
+ NULL,
+};
+
static struct attribute_group rapl_pmu_events_group = {
.name = "events",
.attrs = NULL, /* patched at runtime */
@@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void)
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
break;
+ case 87: /* Knights Landing */
+ rapl_add_quirk(rapl_hsw_server_quirk);
+ rapl_cntr_mask = RAPL_IDX_KNL;
+ rapl_pmu_events_group.attrs = rapl_events_knl_attr;
default:
/* unsupported */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38c921b..560e5255b15e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -911,6 +911,9 @@ static int __init uncore_pci_init(void)
case 63: /* Haswell-EP */
ret = hswep_uncore_pci_init();
break;
+ case 86: /* BDX-DE */
+ ret = bdx_uncore_pci_init();
+ break;
case 42: /* Sandy Bridge */
ret = snb_uncore_pci_init();
break;
@@ -1209,6 +1212,11 @@ static int __init uncore_cpu_init(void)
break;
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
+ case 60: /* Haswell */
+ case 69: /* Haswell */
+ case 70: /* Haswell */
+ case 61: /* Broadwell */
+ case 71: /* Broadwell */
snb_uncore_cpu_init();
break;
case 45: /* Sandy Bridge-EP */
@@ -1224,6 +1232,9 @@ static int __init uncore_cpu_init(void)
case 63: /* Haswell-EP */
hswep_uncore_cpu_init();
break;
+ case 86: /* BDX-DE */
+ bdx_uncore_cpu_init();
+ break;
default:
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a196e4..72c54c2e5b1a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void);
void ivbep_uncore_cpu_init(void);
int hswep_uncore_pci_init(void);
void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);
/* perf_event_intel_uncore_nhmex.c */
void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78c7012..f78574b3cb55 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -45,6 +45,11 @@
#define SNB_UNC_CBO_0_PER_CTR0 0x706
#define SNB_UNC_CBO_MSR_OFFSET 0x10
+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0 0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0 0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET 0x10
+
/* NHM global control register */
#define NHM_UNC_PERF_GLOBAL_CTL 0x391
#define NHM_UNC_FIXED_CTR 0x394
@@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = {
.read_counter = uncore_msr_read_counter,
};
-static struct event_constraint snb_uncore_cbox_constraints[] = {
+static struct event_constraint snb_uncore_arb_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
EVENT_CONSTRAINT_END
@@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = {
.single_fixed = 1,
.event_mask = SNB_UNC_RAW_EVENT_MASK,
.msr_offset = SNB_UNC_CBO_MSR_OFFSET,
- .constraints = snb_uncore_cbox_constraints,
.ops = &snb_uncore_msr_ops,
.format_group = &snb_uncore_format_group,
.event_descs = snb_uncore_events,
};
+static struct intel_uncore_type snb_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .perf_ctr = SNB_UNC_ARB_PER_CTR0,
+ .event_ctl = SNB_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
static struct intel_uncore_type *snb_msr_uncores[] = {
&snb_uncore_cbox,
+ &snb_uncore_arb,
NULL,
};
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 6d6e85dd5849..694510a887dc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = {
NULL,
};
-static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
{ /* Home Agent 0 */
PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
@@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void)
return 0;
}
/* end of Haswell-EP uncore support */
+
+/* BDX-DE uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = bdx_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+ &bdx_uncore_ubox,
+ &bdx_uncore_cbox,
+ &hswep_uncore_pcu,
+ NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+ if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = bdx_msr_uncores;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = bdx_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+ BDX_PCI_UNCORE_HA,
+ BDX_PCI_UNCORE_IMC,
+ BDX_PCI_UNCORE_IRP,
+ BDX_PCI_UNCORE_R2PCIE,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+ [BDX_PCI_UNCORE_HA] = &bdx_uncore_ha,
+ [BDX_PCI_UNCORE_IMC] = &bdx_uncore_imc,
+ [BDX_PCI_UNCORE_IRP] = &bdx_uncore_irp,
+ [BDX_PCI_UNCORE_R2PCIE] = &bdx_uncore_r2pcie,
+ NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+ .driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+ .name = "bdx_uncore",
+ .id_table = bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+ int ret = snbep_pci2phy_map_init(0x6f1e);
+
+ if (ret)
+ return ret;
+ uncore_pci_uncores = bdx_pci_uncores;
+ uncore_pci_driver = &bdx_uncore_pci_driver;
+ return 0;
+}
+
+/* end of BDX-DE uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_msr.c b/arch/x86/kernel/cpu/perf_event_msr.c
new file mode 100644
index 000000000000..f32ac13934f2
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,242 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+ PERF_MSR_TSC = 0,
+ PERF_MSR_APERF = 1,
+ PERF_MSR_MPERF = 2,
+ PERF_MSR_PPERF = 3,
+ PERF_MSR_SMI = 4,
+
+ PERF_MSR_EVENT_MAX,
+};
+
+static bool test_aperfmperf(int idx)
+{
+ return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+static bool test_intel(int idx)
+{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ boot_cpu_data.x86 != 6)
+ return false;
+
+ switch (boot_cpu_data.x86_model) {
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
+
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
+
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
+
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+ case 79: /* 14nm Broadwell Server */
+
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+ case 76: /* 14nm Atom "Airmont" */
+ if (idx == PERF_MSR_SMI)
+ return true;
+ break;
+
+ case 78: /* 14nm Skylake Mobile */
+ case 94: /* 14nm Skylake Desktop */
+ if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+ return true;
+ break;
+ }
+
+ return false;
+}
+
+struct perf_msr {
+ u64 msr;
+ struct perf_pmu_events_attr *attr;
+ bool (*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc, evattr_tsc, "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi, evattr_smi, "event=0x04");
+
+static struct perf_msr msr[] = {
+ [PERF_MSR_TSC] = { 0, &evattr_tsc, NULL, },
+ [PERF_MSR_APERF] = { MSR_IA32_APERF, &evattr_aperf, test_aperfmperf, },
+ [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &evattr_mperf, test_aperfmperf, },
+ [PERF_MSR_PPERF] = { MSR_PPERF, &evattr_pperf, test_intel, },
+ [PERF_MSR_SMI] = { MSR_SMI_COUNT, &evattr_smi, test_intel, },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+ NULL,
+};
+
+static struct attribute_group events_attr_group = {
+ .name = "events",
+ .attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+static struct attribute_group format_attr_group = {
+ .name = "format",
+ .attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &events_attr_group,
+ &format_attr_group,
+ NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ if (cfg >= PERF_MSR_EVENT_MAX)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ if (!msr[cfg].attr)
+ return -EINVAL;
+
+ event->hw.idx = -1;
+ event->hw.event_base = msr[cfg].msr;
+ event->hw.config = cfg;
+
+ return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+ u64 now;
+
+ if (event->hw.event_base)
+ rdmsrl(event->hw.event_base, now);
+ else
+ rdtscll(now);
+
+ return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+ u64 prev, now;
+ s64 delta;
+
+ /* Careful, an NMI might modify the previous event value. */
+again:
+ prev = local64_read(&event->hw.prev_count);
+ now = msr_read_counter(event);
+
+ if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+ goto again;
+
+ delta = now - prev;
+ if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+ delta <<= 32;
+ delta >>= 32; /* sign extend */
+ }
+ local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+ u64 now;
+
+ now = msr_read_counter(event);
+ local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+ msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+ msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+ if (flags & PERF_EF_START)
+ msr_event_start(event, flags);
+
+ return 0;
+}
+
+static struct pmu pmu_msr = {
+ .task_ctx_nr = perf_sw_context,
+ .attr_groups = attr_groups,
+ .event_init = msr_event_init,
+ .add = msr_event_add,
+ .del = msr_event_del,
+ .start = msr_event_start,
+ .stop = msr_event_stop,
+ .read = msr_event_update,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+ int i, j = 0;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC)) {
+ pr_cont("no MSR PMU driver.\n");
+ return 0;
+ }
+
+ /* Probe the MSRs. */
+ for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+ u64 val;
+
+ /*
+ * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+ */
+ if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+ msr[i].attr = NULL;
+ }
+
+ /* List remaining MSRs in the sysfs attrs. */
+ for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+ if (msr[i].attr)
+ events_attrs[j++] = &msr[i].attr->attr.attr;
+ }
+ events_attrs[j] = NULL;
+
+ perf_pmu_register(&pmu_msr, "msr", -1);
+
+ return 0;
+}
+device_initcall(msr_init);
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 3d423a101fae..608fb26c7254 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -37,7 +37,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
{ X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
{ X86_FEATURE_HWP, CR_EAX, 7, 0x00000006, 0 },
- { X86_FEATURE_HWP_NOITFY, CR_EAX, 8, 0x00000006, 0 },
+ { X86_FEATURE_HWP_NOTIFY, CR_EAX, 8, 0x00000006, 0 },
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 83741a71558f..bd3507da39f0 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb,
return notifier_from_errno(err);
}
-static struct notifier_block __refdata cpuid_class_cpu_notifier =
+static struct notifier_block cpuid_class_cpu_notifier =
{
.notifier_call = cpuid_class_cpu_callback,
};
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e068d6683dba..74ca2fe7a0b3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -185,10 +185,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
}
#ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(unsigned long start_pfn,
- unsigned long nr_pfn, void *arg)
+static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
{
- int *nr_ranges = arg;
+ unsigned int *nr_ranges = arg;
(*nr_ranges)++;
return 0;
@@ -214,7 +213,7 @@ static void fill_up_crash_elf_data(struct crash_elf_data *ced,
ced->image = image;
- walk_system_ram_range(0, -1, &nr_ranges,
+ walk_system_ram_res(0, -1, &nr_ranges,
get_nr_ram_ranges_callback);
ced->max_nr_ranges = nr_ranges;
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 89427d8d4fc5..eec40f595ab9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -175,7 +175,9 @@ static __init void early_serial_init(char *s)
}
if (*s) {
- if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = simple_strtoull(s, &e, 0);
+
+ if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730e7b08..4d38416e2a7f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
*/
if (!arch_get_random_long(&rand)) {
/* The constant is an arbitrary large prime */
- rdtscll(rand);
+ rand = rdtsc();
rand *= 0xc345c6b72fd16123UL;
}
@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
init_espfix_random();
/* The rest is the same as for any other processor */
- init_espfix_ap();
+ init_espfix_ap(0);
}
-void init_espfix_ap(void)
+void init_espfix_ap(int cpu)
{
- unsigned int cpu, page;
+ unsigned int page;
unsigned long addr;
pud_t pud, *pud_p;
pmd_t pmd, *pmd_p;
pte_t pte, *pte_p;
- int n;
+ int n, node;
void *stack_page;
pteval_t ptemask;
/* We only have to do this once... */
- if (likely(this_cpu_read(espfix_stack)))
+ if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
- cpu = smp_processor_id();
addr = espfix_base_addr(cpu);
page = cpu/ESPFIX_STACKS_PER_PAGE;
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
if (stack_page)
goto unlock_done;
+ node = cpu_to_node(cpu);
ptemask = __supported_pte_mask;
pud_p = &espfix_pud_page[pud_index(addr)];
pud = *pud_p;
if (!pud_present(pud)) {
- pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pmd_p = (pmd_t *)page_address(page);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
pmd_p = pmd_offset(&pud, addr);
pmd = *pmd_p;
if (!pmd_present(pmd)) {
- pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
+ struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
+
+ pte_p = (pte_t *)page_address(page);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
}
pte_p = pte_offset_kernel(&pmd, addr);
- stack_page = (void *)__get_free_page(GFP_KERNEL);
+ stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
unlock_done:
mutex_unlock(&espfix_init_mutex);
done:
- this_cpu_write(espfix_stack, addr);
- this_cpu_write(espfix_waddr, (unsigned long)stack_page
- + (addr & ~PAGE_MASK));
+ per_cpu(espfix_stack, cpu) = addr;
+ per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ + (addr & ~PAGE_MASK);
}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 79de954626fd..d25097c3fc1d 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -270,7 +270,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
- if (src_fpu->fpstate_active)
+ if (src_fpu->fpstate_active && cpu_has_fpu)
fpu_copy(dst_fpu, src_fpu);
return 0;
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 32826791e675..d14e9ac3235a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <linux/sched.h>
+
/*
* Initialize the TS bit in CR0 according to the style of context-switches
* we are using:
@@ -38,7 +40,12 @@ static void fpu__init_cpu_generic(void)
write_cr0(cr0);
/* Flush out any pending x87 state: */
- asm volatile ("fninit");
+#ifdef CONFIG_MATH_EMULATION
+ if (!cpu_has_fpu)
+ fpstate_init_soft(&current->thread.fpu.state.soft);
+ else
+#endif
+ asm volatile ("fninit");
}
/*
@@ -136,6 +143,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
+/* Enforce that 'MEMBER' is the last field of 'TYPE': */
+#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
+ BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
+
+/*
+ * We append the 'struct fpu' to the task_struct:
+ */
+static void __init fpu__init_task_struct_size(void)
+{
+ int task_size = sizeof(struct task_struct);
+
+ /*
+ * Subtract off the static size of the register state.
+ * It potentially has a bunch of padding.
+ */
+ task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
+
+ /*
+ * Add back the dynamically-calculated register state
+ * size.
+ */
+ task_size += xstate_size;
+
+ /*
+ * We dynamically size 'struct fpu', so we require that
+ * it be at the end of 'thread_struct' and that
+ * 'thread_struct' be at the end of 'task_struct'. If
+ * you hit a compile error here, check the structure to
+ * see if something got added to the end.
+ */
+ CHECK_MEMBER_AT_END_OF(struct fpu, state);
+ CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
+ CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
+
+ arch_task_struct_size = task_size;
+}
+
/*
* Set up the xstate_size based on the legacy FPU context size.
*
@@ -287,6 +331,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
+ fpu__init_task_struct_size();
fpu__init_system_ctx_switch();
}
@@ -311,9 +356,15 @@ static int __init x86_noxsave_setup(char *s)
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
return 1;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5a4668136e98..f129a9af6357 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -161,11 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
- kasan_map_early_shadow(early_level4_pgt);
-
- /* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
+ clear_page(init_level4_pgt);
+
+ kasan_early_init();
+
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -177,12 +178,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
- kasan_map_early_shadow(init_level4_pgt);
-
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e5c27f729a38..1d40ca8a73f2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -516,38 +516,9 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
-#ifdef CONFIG_KASAN
-#define FILL(VAL, COUNT) \
- .rept (COUNT) ; \
- .quad (VAL) ; \
- .endr
-
-NEXT_PAGE(kasan_zero_pte)
- FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pmd)
- FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
-NEXT_PAGE(kasan_zero_pud)
- FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
-
-#undef FILL
-#endif
-
-
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
-#ifdef CONFIG_KASAN
-/*
- * This page used as early shadow. We don't use empty_zero_page
- * at early stages, stack instrumentation could write some garbage
- * to this page.
- * Latter we reuse it as zero shadow for large ranges of memory
- * that allowed to access, but not instrumented by kasan
- * (vmalloc/vmemmap ...).
- */
-NEXT_PAGE(kasan_zero_page)
- .skip PAGE_SIZE
-#endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0a3fcf..88b4da373081 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
*/
static unsigned long hpet_freq;
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
- struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
- .name = "hpet",
- .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
- .set_mode = hpet_legacy_set_mode,
- .set_next_event = hpet_legacy_next_event,
- .irq = 0,
- .rating = 50,
-};
+static struct clock_event_device hpet_clockevent;
static void hpet_stop_counter(void)
{
@@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void)
printk(KERN_DEBUG "hpet clockevent registered\n");
}
-static void hpet_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt, int timer)
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
{
unsigned int cfg, cmp, now;
uint64_t delta;
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- hpet_stop_counter();
- delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
- delta >>= evt->shift;
- now = hpet_readl(HPET_COUNTER);
- cmp = now + (unsigned int) delta;
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
- HPET_TN_SETVAL | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- hpet_writel(cmp, HPET_Tn_CMP(timer));
- udelay(1);
- /*
- * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
- * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
- * bit is automatically cleared after the first write.
- * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
- * Publication # 24674)
- */
- hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
- hpet_start_counter();
- hpet_print_config();
- break;
+ hpet_stop_counter();
+ delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+ delta >>= evt->shift;
+ now = hpet_readl(HPET_COUNTER);
+ cmp = now + (unsigned int)delta;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+ HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+ hpet_writel(cmp, HPET_Tn_CMP(timer));
+ udelay(1);
+ /*
+ * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+ * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+ * bit is automatically cleared after the first write.
+ * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+ * Publication # 24674)
+ */
+ hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+ hpet_start_counter();
+ hpet_print_config();
- case CLOCK_EVT_MODE_ONESHOT:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_PERIODIC;
- cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
+ return 0;
+}
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- cfg = hpet_readl(HPET_Tn_CFG(timer));
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_Tn_CFG(timer));
- break;
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
- case CLOCK_EVT_MODE_RESUME:
- if (timer == 0) {
- hpet_enable_legacy_int();
- } else {
- struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
- disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
- enable_irq(hdev->irq);
- }
- hpet_print_config();
- break;
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_PERIODIC;
+ cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+ unsigned int cfg;
+
+ cfg = hpet_readl(HPET_Tn_CFG(timer));
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+ return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt, int timer)
+{
+ if (!timer) {
+ hpet_enable_legacy_int();
+ } else {
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
+ disable_irq(hdev->irq);
+ irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
+ enable_irq(hdev->irq);
}
+ hpet_print_config();
+
+ return 0;
}
static int hpet_next_event(unsigned long delta,
@@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta,
return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
+{
+ return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+ return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
{
- hpet_set_mode(mode, evt, 0);
+ return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+ return hpet_resume(evt, 0);
}
static int hpet_legacy_next_event(unsigned long delta,
@@ -416,6 +425,22 @@ static int hpet_legacy_next_event(unsigned long delta,
}
/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+ .name = "hpet",
+ .features = CLOCK_EVT_FEAT_PERIODIC |
+ CLOCK_EVT_FEAT_ONESHOT,
+ .set_state_periodic = hpet_legacy_set_periodic,
+ .set_state_oneshot = hpet_legacy_set_oneshot,
+ .set_state_shutdown = hpet_legacy_shutdown,
+ .tick_resume = hpet_legacy_resume,
+ .set_next_event = hpet_legacy_next_event,
+ .irq = 0,
+ .rating = 50,
+};
+
+/*
* HPET MSI Support
*/
#ifdef CONFIG_PCI_MSI
@@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain;
void hpet_msi_unmask(struct irq_data *data)
{
- struct hpet_dev *hdev = data->handler_data;
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
unsigned int cfg;
/* unmask it */
@@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data)
void hpet_msi_mask(struct irq_data *data)
{
- struct hpet_dev *hdev = data->handler_data;
+ struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
unsigned int cfg;
/* mask it */
@@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
msg->address_hi = 0;
}
-static void hpet_msi_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int hpet_msi_shutdown(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
{
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
- hpet_set_mode(mode, evt, hdev->num);
+
+ return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+ struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+ return hpet_resume(evt, hdev->num);
}
static int hpet_msi_next_event(unsigned long delta,
@@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
evt->rating = 110;
evt->features = CLOCK_EVT_FEAT_ONESHOT;
- if (hdev->flags & HPET_DEV_PERI_CAP)
+ if (hdev->flags & HPET_DEV_PERI_CAP) {
evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+ evt->set_state_periodic = hpet_msi_set_periodic;
+ }
- evt->set_mode = hpet_msi_set_mode;
+ evt->set_state_shutdown = hpet_msi_shutdown;
+ evt->set_state_oneshot = hpet_msi_set_oneshot;
+ evt->tick_resume = hpet_msi_resume;
evt->set_next_event = hpet_msi_next_event;
evt->cpumask = cpumask_of(hdev->cpu);
@@ -735,7 +785,7 @@ static int hpet_clocksource_register(void)
/* Verify whether hpet counter works */
t1 = hpet_readl(HPET_COUNTER);
- rdtscll(start);
+ start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
@@ -745,7 +795,7 @@ static int hpet_clocksource_register(void)
*/
do {
rep_nop();
- rdtscll(now);
+ now = rdtsc();
} while ((now - start) < 200000UL);
if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 7114ba220fd4..50a3fad5b89f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,7 @@
#include <linux/irqflags.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
@@ -179,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
va = info->address;
len = bp->attr.bp_len;
- return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+ /*
+ * We don't need to worry about va + len - 1 overflowing:
+ * we already require that va is aligned to a multiple of len.
+ */
+ return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}
int arch_bp_generic_fields(int x86_len, int x86_type,
@@ -243,6 +248,20 @@ static int arch_build_bp_info(struct perf_event *bp)
info->type = X86_BREAKPOINT_RW;
break;
case HW_BREAKPOINT_X:
+ /*
+ * We don't allow kernel breakpoints in places that are not
+ * acceptable for kprobes. On non-kprobes kernels, we don't
+ * allow kernel breakpoints at all.
+ */
+ if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+ if (within_kprobe_blacklist(bp->attr.bp_addr))
+ return -EINVAL;
+#else
+ return -EINVAL;
+#endif
+ }
+
info->type = X86_BREAKPOINT_EXECUTE;
/*
* x86 inst breakpoints need to have a specific undefined len.
@@ -276,8 +295,18 @@ static int arch_build_bp_info(struct perf_event *bp)
break;
#endif
default:
+ /* AMD range breakpoint */
if (!is_power_of_2(bp->attr.bp_len))
return -EINVAL;
+ if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
+ return -EINVAL;
+ /*
+ * It's impossible to use a range breakpoint to fake out
+ * user vs kernel detection because bp_len - 1 can't
+ * have the high bit set. If we ever allow range instruction
+ * breakpoints, then we'll have to check for kprobe-blacklisted
+ * addresses anywhere in the range.
+ */
if (!cpu_has_bpext)
return -EOPNOTSUPP;
info->mask = bp->attr.bp_len - 1;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index f2b96de3c7c1..efb82f07b29c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void)
* - when local APIC timer is active (PIT is switched off)
*/
if (num_possible_cpus() > 1 || is_hpet_enabled() ||
- i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+ !clockevent_state_periodic(&i8253_clockevent))
return 0;
return clocksource_i8253_init();
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b366487b0e..f8062aaf5df9 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -139,10 +139,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
- seq_printf(p, "%*s: ", prec, "HYP");
- for_each_online_cpu(j)
- seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
- seq_puts(p, " Hypervisor callback interrupts\n");
+ if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) {
+ seq_printf(p, "%*s: ", prec, "HYP");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ",
+ irq_stats(j)->irq_hv_callback_count);
+ seq_puts(p, " Hypervisor callback interrupts\n");
+ }
#endif
seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
#if defined(CONFIG_X86_IO_APIC)
@@ -211,24 +214,38 @@ u64 arch_irq_stat(void)
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
-
+ struct irq_desc * desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
- unsigned irq;
+
+ /*
+ * NB: Unlike exception entries, IRQ entries do not reliably
+ * handle context tracking in the low-level entry code. This is
+ * because syscall entries execute briefly with IRQs on before
+ * updating context tracking state, so we can take an IRQ from
+ * kernel mode with CONTEXT_USER. The low-level entry code only
+ * updates the context if we came from user mode, so we won't
+ * switch to CONTEXT_KERNEL. We'll fix that once the syscall
+ * code is cleaned up enough that we can cleanly defer enabling
+ * IRQs.
+ */
entering_irq();
- irq = __this_cpu_read(vector_irq[vector]);
+ /* entering_irq() tells RCU that we're not quiescent. Check it. */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
+
+ desc = __this_cpu_read(vector_irq[vector]);
- if (!handle_irq(irq, regs)) {
+ if (!handle_irq(desc, regs)) {
ack_APIC_irq();
- if (irq != VECTOR_RETRIGGERED) {
- pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+ if (desc != VECTOR_RETRIGGERED) {
+ pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
__func__, smp_processor_id(),
- vector, irq);
+ vector);
} else {
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
@@ -330,10 +347,10 @@ static struct cpumask affinity_new, online_new;
*/
int check_irq_vectors_for_cpu_disable(void)
{
- int irq, cpu;
unsigned int this_cpu, vector, this_count, count;
struct irq_desc *desc;
struct irq_data *data;
+ int cpu;
this_cpu = smp_processor_id();
cpumask_copy(&online_new, cpu_online_mask);
@@ -341,39 +358,43 @@ int check_irq_vectors_for_cpu_disable(void)
this_count = 0;
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- irq = __this_cpu_read(vector_irq[vector]);
- if (irq >= 0) {
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
+ desc = __this_cpu_read(vector_irq[vector]);
+ if (IS_ERR_OR_NULL(desc))
+ continue;
+ /*
+ * Protect against concurrent action removal, affinity
+ * changes etc.
+ */
+ raw_spin_lock(&desc->lock);
+ data = irq_desc_get_irq_data(desc);
+ cpumask_copy(&affinity_new,
+ irq_data_get_affinity_mask(data));
+ cpumask_clear_cpu(this_cpu, &affinity_new);
- data = irq_desc_get_irq_data(desc);
- cpumask_copy(&affinity_new, data->affinity);
- cpumask_clear_cpu(this_cpu, &affinity_new);
-
- /* Do not count inactive or per-cpu irqs. */
- if (!irq_has_action(irq) || irqd_is_per_cpu(data))
- continue;
-
- /*
- * A single irq may be mapped to multiple
- * cpu's vector_irq[] (for example IOAPIC cluster
- * mode). In this case we have two
- * possibilities:
- *
- * 1) the resulting affinity mask is empty; that is
- * this the down'd cpu is the last cpu in the irq's
- * affinity mask, or
- *
- * 2) the resulting affinity mask is no longer
- * a subset of the online cpus but the affinity
- * mask is not zero; that is the down'd cpu is the
- * last online cpu in a user set affinity mask.
- */
- if (cpumask_empty(&affinity_new) ||
- !cpumask_subset(&affinity_new, &online_new))
- this_count++;
+ /* Do not count inactive or per-cpu irqs. */
+ if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
+ raw_spin_unlock(&desc->lock);
+ continue;
}
+
+ raw_spin_unlock(&desc->lock);
+ /*
+ * A single irq may be mapped to multiple cpu's
+ * vector_irq[] (for example IOAPIC cluster mode). In
+ * this case we have two possibilities:
+ *
+ * 1) the resulting affinity mask is empty; that is
+ * this the down'd cpu is the last cpu in the irq's
+ * affinity mask, or
+ *
+ * 2) the resulting affinity mask is no longer a
+ * subset of the online cpus but the affinity mask is
+ * not zero; that is the down'd cpu is the last online
+ * cpu in a user set affinity mask.
+ */
+ if (cpumask_empty(&affinity_new) ||
+ !cpumask_subset(&affinity_new, &online_new))
+ this_count++;
}
count = 0;
@@ -385,12 +406,15 @@ int check_irq_vectors_for_cpu_disable(void)
* vector. If the vector is marked in the used vectors
* bitmap or an irq is assigned to it, we don't count
* it as available.
+ *
+ * As this is an inaccurate snapshot anyway, we can do
+ * this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
if (!test_bit(vector, used_vectors) &&
- per_cpu(vector_irq, cpu)[vector] < 0)
- count++;
+ IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+ count++;
}
}
@@ -426,7 +450,7 @@ void fixup_irqs(void)
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
- affinity = data->affinity;
+ affinity = irq_data_get_affinity_mask(data);
if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
cpumask_subset(affinity, cpu_online_mask)) {
raw_spin_unlock(&desc->lock);
@@ -486,20 +510,24 @@ void fixup_irqs(void)
*/
mdelay(1);
+ /*
+ * We can walk the vector array of this cpu without holding
+ * vector_lock because the cpu is already marked !online, so
+ * nothing else will touch it.
+ */
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+ if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
- irq = __this_cpu_read(vector_irq[vector]);
+ desc = __this_cpu_read(vector_irq[vector]);
- desc = irq_to_desc(irq);
+ raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
- raw_spin_lock(&desc->lock);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
@@ -507,7 +535,7 @@ void fixup_irqs(void)
raw_spin_unlock(&desc->lock);
}
if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
}
}
#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index cd74f5978ab9..38da8f29a9c8 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -68,11 +68,10 @@ static inline void *current_stack(void)
return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}
-static inline int
-execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
+static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
{
struct irq_stack *curstk, *irqstk;
- u32 *isp, *prev_esp, arg1, arg2;
+ u32 *isp, *prev_esp, arg1;
curstk = (struct irq_stack *) current_stack();
irqstk = __this_cpu_read(hardirq_stack);
@@ -98,8 +97,8 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
asm volatile("xchgl %%ebx,%%esp \n"
"call *%%edi \n"
"movl %%ebx,%%esp \n"
- : "=a" (arg1), "=d" (arg2), "=b" (isp)
- : "0" (irq), "1" (desc), "2" (isp),
+ : "=a" (arg1), "=b" (isp)
+ : "0" (desc), "1" (isp),
"D" (desc->handle_irq)
: "memory", "cc", "ecx");
return 1;
@@ -148,21 +147,17 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
- int overflow;
+ int overflow = check_stack_overflow();
- overflow = check_stack_overflow();
-
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
+ if (IS_ERR_OR_NULL(desc))
return false;
- if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc)) {
if (unlikely(overflow))
print_stack_overflow();
- desc->handle_irq(irq, desc);
+ generic_handle_irq_desc(desc);
}
return true;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index bc4604e500a3..c767cf2bc80a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
#endif
}
-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
- struct irq_desc *desc;
-
stack_overflow_check(regs);
- desc = irq_to_desc(irq);
- if (unlikely(!desc))
+ if (unlikely(IS_ERR_OR_NULL(desc)))
return false;
- generic_handle_irq_desc(irq, desc);
+ generic_handle_irq_desc(desc);
return true;
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a3a5e158ed69..1423ab1b0312 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
};
DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
- [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
+ [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
int cpu;
for_each_online_cpu(cpu) {
- if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
+ if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
return 1;
}
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
* irq's migrate etc.
*/
for (i = 0; i < nr_legacy_irqs(); i++)
- per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
+ per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
x86_init.irqs.intr_init();
}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 26d5a55a2736..e565e0e4d216 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -45,7 +45,7 @@ static void __jump_label_transform(struct jump_entry *entry,
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- if (type == JUMP_LABEL_ENABLE) {
+ if (type == JUMP_LABEL_JMP) {
if (init) {
/*
* Jump label is enabled for the first time.
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ca83f7ac388b..0f8a6bbaaa44 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -223,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
memset(&params->hd0_info, 0, sizeof(params->hd0_info));
memset(&params->hd1_info, 0, sizeof(params->hd1_info));
- /* Default sysdesc table */
- params->sys_desc_table.length = 0;
-
if (image->type == KEXEC_TYPE_CRASH) {
ret = crash_setup_memmap_entries(image, params);
if (ret)
@@ -536,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
int ret;
ret = verify_pefile_signature(kernel, kernel_len,
- system_trusted_keyring, &trusted);
+ system_trusted_keyring,
+ VERIFYING_KEXEC_PE_SIGNATURE,
+ &trusted);
if (ret < 0)
return ret;
if (!trusted)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 49487b488061..2c7aafa70702 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -200,7 +200,7 @@ static void kvm_setup_secondary_clock(void)
* kind of shutdown from our side, we unregister the clock by writting anything
* that does not have the 'enable' bit set in the msr
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
static void kvm_crash_shutdown(struct pt_regs *regs)
{
native_write_msr(msr_kvm_system_time, 0, 0);
@@ -259,7 +259,7 @@ void __init kvmclock_init(void)
x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
machine_ops.shutdown = kvm_shutdown;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index c37886d759cc..6acc9dd91f36 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -20,82 +21,82 @@
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
-#ifdef CONFIG_SMP
+/* context.lock is held for us, so we don't need any locking. */
static void flush_ldt(void *current_mm)
{
- if (current->active_mm == current_mm)
- load_LDT(&current->active_mm->context);
+ mm_context_t *pc;
+
+ if (current->active_mm != current_mm)
+ return;
+
+ pc = &current->active_mm->context;
+ set_ldt(pc->ldt->entries, pc->ldt->size);
}
-#endif
-static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
+static struct ldt_struct *alloc_ldt_struct(int size)
{
- void *oldldt, *newldt;
- int oldsize;
-
- if (mincount <= pc->size)
- return 0;
- oldsize = pc->size;
- mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
- (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
- if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
- newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+ struct ldt_struct *new_ldt;
+ int alloc_size;
+
+ if (size > LDT_ENTRIES)
+ return NULL;
+
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ if (!new_ldt)
+ return NULL;
+
+ BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
+ alloc_size = size * LDT_ENTRY_SIZE;
+
+ /*
+ * Xen is very picky: it requires a page-aligned LDT that has no
+ * trailing nonzero bytes in any page that contains LDT descriptors.
+ * Keep it simple: zero the whole allocation and never allocate less
+ * than PAGE_SIZE.
+ */
+ if (alloc_size > PAGE_SIZE)
+ new_ldt->entries = vzalloc(alloc_size);
else
- newldt = (void *)__get_free_page(GFP_KERNEL);
-
- if (!newldt)
- return -ENOMEM;
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
- if (oldsize)
- memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
- oldldt = pc->ldt;
- memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
- (mincount - oldsize) * LDT_ENTRY_SIZE);
+ if (!new_ldt->entries) {
+ kfree(new_ldt);
+ return NULL;
+ }
- paravirt_alloc_ldt(newldt, mincount);
+ new_ldt->size = size;
+ return new_ldt;
+}
-#ifdef CONFIG_X86_64
- /* CHECKME: Do we really need this ? */
- wmb();
-#endif
- pc->ldt = newldt;
- wmb();
- pc->size = mincount;
- wmb();
-
- if (reload) {
-#ifdef CONFIG_SMP
- preempt_disable();
- load_LDT(pc);
- if (!cpumask_equal(mm_cpumask(current->mm),
- cpumask_of(smp_processor_id())))
- smp_call_function(flush_ldt, current->mm, 1);
- preempt_enable();
-#else
- load_LDT(pc);
-#endif
- }
- if (oldsize) {
- paravirt_free_ldt(oldldt, oldsize);
- if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(oldldt);
- else
- put_page(virt_to_page(oldldt));
- }
- return 0;
+/* After calling this, the LDT is immutable. */
+static void finalize_ldt_struct(struct ldt_struct *ldt)
+{
+ paravirt_alloc_ldt(ldt->entries, ldt->size);
}
-static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+/* context.lock is held */
+static void install_ldt(struct mm_struct *current_mm,
+ struct ldt_struct *ldt)
{
- int err = alloc_ldt(new, old->size, 0);
- int i;
+ /* Synchronizes with lockless_dereference in load_mm_ldt. */
+ smp_store_release(&current_mm->context.ldt, ldt);
+
+ /* Activate the LDT for all CPUs using current_mm. */
+ on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+}
- if (err < 0)
- return err;
+static void free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (likely(!ldt))
+ return;
- for (i = 0; i < old->size; i++)
- write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
- return 0;
+ paravirt_free_ldt(ldt->entries, ldt->size);
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
}
/*
@@ -104,17 +105,37 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
*/
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
+ struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0;
mutex_init(&mm->context.lock);
- mm->context.size = 0;
old_mm = current->mm;
- if (old_mm && old_mm->context.size > 0) {
- mutex_lock(&old_mm->context.lock);
- retval = copy_ldt(&mm->context, &old_mm->context);
- mutex_unlock(&old_mm->context.lock);
+ if (!old_mm) {
+ mm->context.ldt = NULL;
+ return 0;
}
+
+ mutex_lock(&old_mm->context.lock);
+ if (!old_mm->context.ldt) {
+ mm->context.ldt = NULL;
+ goto out_unlock;
+ }
+
+ new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
+ if (!new_ldt) {
+ retval = -ENOMEM;
+ goto out_unlock;
+ }
+
+ memcpy(new_ldt->entries, old_mm->context.ldt->entries,
+ new_ldt->size * LDT_ENTRY_SIZE);
+ finalize_ldt_struct(new_ldt);
+
+ mm->context.ldt = new_ldt;
+
+out_unlock:
+ mutex_unlock(&old_mm->context.lock);
return retval;
}
@@ -125,53 +146,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
*/
void destroy_context(struct mm_struct *mm)
{
- if (mm->context.size) {
-#ifdef CONFIG_X86_32
- /* CHECKME: Can this ever happen ? */
- if (mm == current->active_mm)
- clear_LDT();
-#endif
- paravirt_free_ldt(mm->context.ldt, mm->context.size);
- if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(mm->context.ldt);
- else
- put_page(virt_to_page(mm->context.ldt));
- mm->context.size = 0;
- }
+ free_ldt_struct(mm->context.ldt);
+ mm->context.ldt = NULL;
}
static int read_ldt(void __user *ptr, unsigned long bytecount)
{
- int err;
+ int retval;
unsigned long size;
struct mm_struct *mm = current->mm;
- if (!mm->context.size)
- return 0;
+ mutex_lock(&mm->context.lock);
+
+ if (!mm->context.ldt) {
+ retval = 0;
+ goto out_unlock;
+ }
+
if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
- mutex_lock(&mm->context.lock);
- size = mm->context.size * LDT_ENTRY_SIZE;
+ size = mm->context.ldt->size * LDT_ENTRY_SIZE;
if (size > bytecount)
size = bytecount;
- err = 0;
- if (copy_to_user(ptr, mm->context.ldt, size))
- err = -EFAULT;
- mutex_unlock(&mm->context.lock);
- if (err < 0)
- goto error_return;
+ if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
+ retval = -EFAULT;
+ goto out_unlock;
+ }
+
if (size != bytecount) {
- /* zero-fill the rest */
- if (clear_user(ptr + size, bytecount - size) != 0) {
- err = -EFAULT;
- goto error_return;
+ /* Zero-fill the rest and pretend we read bytecount bytes. */
+ if (clear_user(ptr + size, bytecount - size)) {
+ retval = -EFAULT;
+ goto out_unlock;
}
}
- return bytecount;
-error_return:
- return err;
+ retval = bytecount;
+
+out_unlock:
+ mutex_unlock(&mm->context.lock);
+ return retval;
}
static int read_default_ldt(void __user *ptr, unsigned long bytecount)
@@ -195,6 +210,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
struct desc_struct ldt;
int error;
struct user_desc ldt_info;
+ int oldsize, newsize;
+ struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -213,34 +230,39 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
goto out;
}
- mutex_lock(&mm->context.lock);
- if (ldt_info.entry_number >= mm->context.size) {
- error = alloc_ldt(&current->mm->context,
- ldt_info.entry_number + 1, 1);
- if (error < 0)
- goto out_unlock;
- }
-
- /* Allow LDTs to be cleared by the user. */
- if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
- if (oldmode || LDT_empty(&ldt_info)) {
- memset(&ldt, 0, sizeof(ldt));
- goto install;
+ if ((oldmode && !ldt_info.base_addr && !ldt_info.limit) ||
+ LDT_empty(&ldt_info)) {
+ /* The user wants to clear the entry. */
+ memset(&ldt, 0, sizeof(ldt));
+ } else {
+ if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
+ error = -EINVAL;
+ goto out;
}
+
+ fill_ldt(&ldt, &ldt_info);
+ if (oldmode)
+ ldt.avl = 0;
}
- if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
- error = -EINVAL;
+ mutex_lock(&mm->context.lock);
+
+ old_ldt = mm->context.ldt;
+ oldsize = old_ldt ? old_ldt->size : 0;
+ newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+
+ error = -ENOMEM;
+ new_ldt = alloc_ldt_struct(newsize);
+ if (!new_ldt)
goto out_unlock;
- }
- fill_ldt(&ldt, &ldt_info);
- if (oldmode)
- ldt.avl = 0;
+ if (old_ldt)
+ memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
+ new_ldt->entries[ldt_info.entry_number] = ldt;
+ finalize_ldt_struct(new_ldt);
- /* Install the new entry ... */
-install:
- write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
+ install_ldt(mm, new_ldt);
+ free_ldt_struct(old_ldt);
error = 0;
out_unlock:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d1751c..697f90db0e37 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
a->handler, whole_msecs, decimal_msecs);
}
-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
{
struct nmi_desc *desc = nmi_to_desc(type);
struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
/* check to see if anyone registered against these types of errors */
- if (nmi_handle(NMI_SERR, regs, false))
+ if (nmi_handle(NMI_SERR, regs))
return;
pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
unsigned long i;
/* check to see if anyone registered against these types of errors */
- if (nmi_handle(NMI_IO_CHECK, regs, false))
+ if (nmi_handle(NMI_IO_CHECK, regs))
return;
pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
* as only the first one is ever run (unless it can actually determine
* if it caused the NMI)
*/
- handled = nmi_handle(NMI_UNKNOWN, regs, false);
+ handled = nmi_handle(NMI_UNKNOWN, regs);
if (handled) {
__this_cpu_add(nmi_stats.unknown, handled);
return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
- handled = nmi_handle(NMI_LOCAL, regs, b2b);
+ handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
/*
@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);
/*
- * NMIs can hit breakpoints which will cause it to lose its
- * NMI context with the CPU when the breakpoint does an iret.
- */
-#ifdef CONFIG_X86_32
-/*
- * For i386, NMIs use the same stack as the kernel, and we can
- * add a workaround to the iret problem in C (preventing nested
- * NMIs if an NMI takes a trap). Simply have 3 states the NMI
- * can be in:
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
+ *
+ * As a result, NMIs can nest if NMIs get unmasked due an IRET during
+ * NMI processing. On x86_64, the asm glue protects us from nested NMIs
+ * if the outer NMI came from kernel mode, but we can still nest if the
+ * outer NMI came from user mode.
+ *
+ * To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
- * If an NMI hits a breakpoint that executes an iret, another
- * NMI can preempt it. We do not want to allow this new NMI
- * to run, but we want to execute it when the first one finishes.
- * We set the state to "latched", and the exit of the first NMI will
- * perform a dec_return, if the result is zero (NOT_RUNNING), then
- * it will simply exit the NMI handler. If not, the dec_return
- * would have set the state to NMI_EXECUTING (what we want it to
- * be when we are running). In this case, we simply jump back
- * to rerun the NMI handler again, and restart the 'latched' NMI.
+ * If an NMI executes an iret, another NMI can preempt it. We do not
+ * want to allow this new NMI to run, but we want to execute it when the
+ * first one finishes. We set the state to "latched", and the exit of
+ * the first NMI will perform a dec_return, if the result is zero
+ * (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
+ * dec_return would have set the state to NMI_EXECUTING (what we want it
+ * to be when we are running). In this case, we simply jump back to
+ * rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
-#define nmi_nesting_preprocess(regs) \
- do { \
- if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
- this_cpu_write(nmi_state, NMI_LATCHED); \
- return; \
- } \
- this_cpu_write(nmi_state, NMI_EXECUTING); \
- this_cpu_write(nmi_cr2, read_cr2()); \
- } while (0); \
- nmi_restart:
-
-#define nmi_nesting_postprocess() \
- do { \
- if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
- write_cr2(this_cpu_read(nmi_cr2)); \
- if (this_cpu_dec_return(nmi_state)) \
- goto nmi_restart; \
- } while (0)
-#else /* x86_64 */
+#ifdef CONFIG_X86_64
/*
- * In x86_64 things are a bit more difficult. This has the same problem
- * where an NMI hitting a breakpoint that calls iret will remove the
- * NMI context, allowing a nested NMI to enter. What makes this more
- * difficult is that both NMIs and breakpoints have their own stack.
- * When a new NMI or breakpoint is executed, the stack is set to a fixed
- * point. If an NMI is nested, it will have its stack set at that same
- * fixed address that the first NMI had, and will start corrupting the
- * stack. This is handled in entry_64.S, but the same problem exists with
- * the breakpoint stack.
+ * In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
+ * some care, the inner breakpoint will clobber the outer breakpoint's
+ * stack.
*
- * If a breakpoint is being processed, and the debug stack is being used,
- * if an NMI comes in and also hits a breakpoint, the stack pointer
- * will be set to the same fixed address as the breakpoint that was
- * interrupted, causing that stack to be corrupted. To handle this case,
- * check if the stack that was interrupted is the debug stack, and if
- * so, change the IDT so that new breakpoints will use the current stack
- * and not switch to the fixed address. On return of the NMI, switch back
- * to the original IDT.
+ * If a breakpoint is being processed, and the debug stack is being
+ * used, if an NMI comes in and also hits a breakpoint, the stack
+ * pointer will be set to the same fixed address as the breakpoint that
+ * was interrupted, causing that stack to be corrupted. To handle this
+ * case, check if the stack that was interrupted is the debug stack, and
+ * if so, change the IDT so that new breakpoints will use the current
+ * stack and not switch to the fixed address. On return of the NMI,
+ * switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
+#endif
-static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+dotraplinkage notrace void
+do_nmi(struct pt_regs *regs, long error_code)
{
+ if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
+ this_cpu_write(nmi_state, NMI_LATCHED);
+ return;
+ }
+ this_cpu_write(nmi_state, NMI_EXECUTING);
+ this_cpu_write(nmi_cr2, read_cr2());
+nmi_restart:
+
+#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
-}
-
-static inline void nmi_nesting_postprocess(void)
-{
- if (unlikely(this_cpu_read(update_debug_stack))) {
- debug_stack_reset();
- this_cpu_write(update_debug_stack, 0);
- }
-}
#endif
-dotraplinkage notrace void
-do_nmi(struct pt_regs *regs, long error_code)
-{
- nmi_nesting_preprocess(regs);
-
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
- /* On i386, may loop back to preprocess */
- nmi_nesting_postprocess();
+#ifdef CONFIG_X86_64
+ if (unlikely(this_cpu_read(update_debug_stack))) {
+ debug_stack_reset();
+ this_cpu_write(update_debug_stack, 0);
+ }
+#endif
+
+ if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
+ write_cr2(this_cpu_read(nmi_cr2));
+ if (this_cpu_dec_return(nmi_state))
+ goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 58bcfb67c01f..c2130aef3f9d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,10 +41,18 @@
#include <asm/timer.h>
#include <asm/special_insns.h>
-/* nop stub */
-void _paravirt_nop(void)
-{
-}
+/*
+ * nop stub, which must not clobber anything *including the stack* to
+ * avoid confusing the entry prologues.
+ */
+extern void _paravirt_nop(void);
+asm (".pushsection .entry.text, \"ax\"\n"
+ ".global _paravirt_nop\n"
+ "_paravirt_nop:\n\t"
+ "ret\n\t"
+ ".size _paravirt_nop, . - _paravirt_nop\n\t"
+ ".type _paravirt_nop, @function\n\t"
+ ".popsection");
/* identity function, which can be inlined */
u32 _paravirt_ident_32(u32 x)
@@ -351,9 +359,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
.write_msr = native_write_msr_safe,
- .read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .read_tscp = native_read_tscp,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
.load_gdt = native_load_gdt,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index e1b013696dde..c89f50a76e97 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_cpu_ops, read_tsc);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 353972c1946c..1b55de1267cf 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -58,17 +58,6 @@ EXPORT_SYMBOL(x86_dma_fallback_dev);
/* Number of entries preallocated for DMA-API debugging */
#define PREALLOC_DMA_DEBUG_ENTRIES 65536
-int dma_set_mask(struct device *dev, u64 mask)
-{
- if (!dev->dma_mask || !dma_supported(dev, mask))
- return -EIO;
-
- *dev->dma_mask = mask;
-
- return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
void __init pci_iommu_alloc(void)
{
struct iommu_table_entry *p;
@@ -140,50 +129,19 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
}
-void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
- gfp_t gfp, struct dma_attrs *attrs)
+bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp)
{
- struct dma_map_ops *ops = get_dma_ops(dev);
- void *memory;
-
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
- if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
- return memory;
-
- if (!dev)
- dev = &x86_dma_fallback_dev;
-
- if (!is_device_dma_capable(dev))
- return NULL;
-
- if (!ops->alloc)
- return NULL;
-
- memory = ops->alloc(dev, size, dma_handle,
- dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
- debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
- return memory;
-}
-EXPORT_SYMBOL(dma_alloc_attrs);
-
-void dma_free_attrs(struct device *dev, size_t size,
- void *vaddr, dma_addr_t bus,
- struct dma_attrs *attrs)
-{
- struct dma_map_ops *ops = get_dma_ops(dev);
-
- WARN_ON(irqs_disabled()); /* for portability */
+ *gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+ *gfp = dma_alloc_coherent_gfp_flags(*dev, *gfp);
- if (dma_release_from_coherent(dev, get_order(size), vaddr))
- return;
+ if (!*dev)
+ *dev = &x86_dma_fallback_dev;
+ if (!is_device_dma_capable(*dev))
+ return false;
+ return true;
- debug_dma_free_coherent(dev, size, vaddr, bus);
- if (ops->free)
- ops->free(dev, size, vaddr, bus, attrs);
}
-EXPORT_SYMBOL(dma_free_attrs);
+EXPORT_SYMBOL(arch_dma_alloc_attrs);
/*
* See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 64f90f53bb85..4f00b63d7ff3 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -3,80 +3,17 @@
* Copyright (c) 2015, Intel Corporation.
*/
#include <linux/platform_device.h>
-#include <linux/libnvdimm.h>
#include <linux/module.h>
-#include <asm/e820.h>
-
-static void e820_pmem_release(struct device *dev)
-{
- struct nvdimm_bus *nvdimm_bus = dev->platform_data;
-
- if (nvdimm_bus)
- nvdimm_bus_unregister(nvdimm_bus);
-}
-
-static struct platform_device e820_pmem = {
- .name = "e820_pmem",
- .id = -1,
- .dev = {
- .release = e820_pmem_release,
- },
-};
-
-static const struct attribute_group *e820_pmem_attribute_groups[] = {
- &nvdimm_bus_attribute_group,
- NULL,
-};
-
-static const struct attribute_group *e820_pmem_region_attribute_groups[] = {
- &nd_region_attribute_group,
- &nd_device_attribute_group,
- NULL,
-};
static __init int register_e820_pmem(void)
{
- static struct nvdimm_bus_descriptor nd_desc;
- struct device *dev = &e820_pmem.dev;
- struct nvdimm_bus *nvdimm_bus;
- int rc, i;
-
- rc = platform_device_register(&e820_pmem);
- if (rc)
- return rc;
-
- nd_desc.attr_groups = e820_pmem_attribute_groups;
- nd_desc.provider_name = "e820";
- nvdimm_bus = nvdimm_bus_register(dev, &nd_desc);
- if (!nvdimm_bus)
- goto err;
- dev->platform_data = nvdimm_bus;
-
- for (i = 0; i < e820.nr_map; i++) {
- struct e820entry *ei = &e820.map[i];
- struct resource res = {
- .flags = IORESOURCE_MEM,
- .start = ei->addr,
- .end = ei->addr + ei->size - 1,
- };
- struct nd_region_desc ndr_desc;
-
- if (ei->type != E820_PRAM)
- continue;
-
- memset(&ndr_desc, 0, sizeof(ndr_desc));
- ndr_desc.res = &res;
- ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
- ndr_desc.numa_node = NUMA_NO_NODE;
- if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
- goto err;
- }
-
- return 0;
-
- err:
- dev_err(dev, "failed to register legacy persistent memory ranges\n");
- platform_device_unregister(&e820_pmem);
- return -ENXIO;
+ struct platform_device *pdev;
+
+ /*
+ * See drivers/nvdimm/e820.c for the implementation, this is
+ * simply here to trigger the module to load on demand.
+ */
+ pdev = platform_device_alloc("e820_pmem", -1);
+ return platform_device_add(pdev);
}
device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9cad694ed7c4..39e585a554b7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,8 @@
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -81,7 +83,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- *dst = *src;
+ memcpy(dst, src, arch_task_struct_size);
return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}
@@ -110,6 +112,8 @@ void exit_thread(void)
kfree(bp);
}
+ free_vm86(t);
+
fpu__drop(fpu);
}
@@ -319,6 +323,7 @@ void stop_this_cpu(void *dummy)
*/
set_cpu_online(smp_processor_id(), false);
disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
for (;;)
halt();
@@ -408,6 +413,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
static void mwait_idle(void)
{
if (!current_set_polling_and_test()) {
+ trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
smp_mb(); /* quirk */
clflush((void *)&current_thread_info()->flags);
@@ -419,6 +425,7 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
}
@@ -499,3 +506,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long start, bottom, top, sp, fp, ip;
+ int count = 0;
+
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+
+ start = (unsigned long)task_stack_page(p);
+ if (!start)
+ return 0;
+
+ /*
+ * Layout of the stack page:
+ *
+ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+ * PADDING
+ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+ * stack
+ * ----------- bottom = start + sizeof(thread_info)
+ * thread_info
+ * ----------- start
+ *
+ * The tasks stack pointer points at the location where the
+ * framepointer is stored. The data on the stack is:
+ * ... IP FP ... IP FP
+ *
+ * We need to read FP and IP, so we need to adjust the upper
+ * bound by another unsigned long.
+ */
+ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+ top -= 2 * sizeof(unsigned long);
+ bottom = start + sizeof(struct thread_info);
+
+ sp = READ_ONCE(p->thread.sp);
+ if (sp < bottom || sp > top)
+ return 0;
+
+ fp = READ_ONCE(*(unsigned long *)sp);
+ do {
+ if (fp < bottom || fp > top)
+ return 0;
+ ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long)));
+ if (!in_sched_functions(ip))
+ return ip;
+ fp = READ_ONCE(*(unsigned long *)fp);
+ } while (count++ < 16 && p->state != TASK_RUNNING);
+ return 0;
+}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f73c962fe636..737527b40e5b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/vm86.h>
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
@@ -323,31 +324,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
return prev_p;
}
-
-#define top_esp (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
-
-unsigned long get_wchan(struct task_struct *p)
-{
- unsigned long bp, sp, ip;
- unsigned long stack_page;
- int count = 0;
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack_page = (unsigned long)task_stack_page(p);
- sp = p->thread.sp;
- if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
- return 0;
- /* include/asm-i386/system.h:switch_to() pushes bp last. */
- bp = *(unsigned long *) sp;
- do {
- if (bp < stack_page || bp > top_ebp+stack_page)
- return 0;
- ip = *(unsigned long *) (bp+4);
- if (!in_sched_functions(ip))
- return ip;
- bp = *(unsigned long *) bp;
- } while (count++ < 16);
- return 0;
-}
-
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71d7849a07f7..b35921a670b2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,13 +121,15 @@ void __show_regs(struct pt_regs *regs, int all)
void release_thread(struct task_struct *dead_task)
{
if (dead_task->mm) {
- if (dead_task->mm->context.size) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ if (dead_task->mm->context.ldt) {
pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
dead_task->comm,
dead_task->mm->context.ldt,
- dead_task->mm->context.size);
+ dead_task->mm->context.ldt->size);
BUG();
}
+#endif
}
}
@@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
__USER_CS, __USER_DS, 0);
}
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
start_thread_common(regs, new_ip, new_sp,
test_thread_flag(TIF_X32)
@@ -497,30 +499,6 @@ void set_personality_ia32(bool x32)
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
-unsigned long get_wchan(struct task_struct *p)
-{
- unsigned long stack;
- u64 fp, ip;
- int count = 0;
-
- if (!p || p == current || p->state == TASK_RUNNING)
- return 0;
- stack = (unsigned long)task_stack_page(p);
- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
- return 0;
- fp = *(u64 *)(p->thread.sp);
- do {
- if (fp < (unsigned long)stack ||
- fp >= (unsigned long)stack+THREAD_SIZE)
- return 0;
- ip = *(u64 *)(fp+8);
- if (!in_sched_functions(ip))
- return ip;
- fp = *(u64 *)fp;
- } while (count++ < 16);
- return 0;
-}
-
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
int ret = 0;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9be72bc3613f..558f50edebca 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
#include <asm/proto.h>
#include <asm/hw_breakpoint.h>
#include <asm/traps.h>
+#include <asm/syscall.h>
#include "tls.h"
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
enum x86_regset {
REGSET_GENERAL,
REGSET_FP,
@@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
+{
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
+ void __user *datap = compat_ptr(data);
+ int ret;
+ __u32 val;
+
+ switch (request) {
+ case PTRACE_PEEKUSR:
+ ret = getreg32(child, addr, &val);
+ if (ret == 0)
+ ret = put_user(val, (__u32 __user *)datap);
+ break;
+
+ case PTRACE_POKEUSR:
+ ret = putreg32(child, addr, data);
+ break;
+
+ case PTRACE_GETREGS: /* Get all gp regs from the child. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_GENERAL,
+ 0, sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_SETREGS: /* Set all gp regs in the child. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_GENERAL, 0,
+ sizeof(struct user_regs_struct32),
+ datap);
+
+ case PTRACE_GETFPREGS: /* Get the child FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_FP, 0,
+ sizeof(struct user_i387_ia32_struct),
+ datap);
+
+ case PTRACE_SETFPREGS: /* Set the child FPU state. */
+ return copy_regset_from_user(
+ child, &user_x86_32_view, REGSET_FP,
+ 0, sizeof(struct user_i387_ia32_struct), datap);
+
+ case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
+ return copy_regset_to_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
+ return copy_regset_from_user(child, &user_x86_32_view,
+ REGSET_XFP, 0,
+ sizeof(struct user32_fxsr_struct),
+ datap);
+
+ case PTRACE_GET_THREAD_AREA:
+ case PTRACE_SET_THREAD_AREA:
+ return arch_ptrace(child, request, addr, data);
+
+ default:
+ return compat_ptrace_request(child, request, addr, data);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
#ifdef CONFIG_X86_X32_ABI
static long x32_arch_ptrace(struct task_struct *child,
compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
}
#endif
+#ifdef CONFIG_COMPAT
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
compat_ulong_t caddr, compat_ulong_t cdata)
{
- unsigned long addr = caddr;
- unsigned long data = cdata;
- void __user *datap = compat_ptr(data);
- int ret;
- __u32 val;
-
#ifdef CONFIG_X86_X32_ABI
if (!is_ia32_task())
return x32_arch_ptrace(child, request, caddr, cdata);
#endif
-
- switch (request) {
- case PTRACE_PEEKUSR:
- ret = getreg32(child, addr, &val);
- if (ret == 0)
- ret = put_user(val, (__u32 __user *)datap);
- break;
-
- case PTRACE_POKEUSR:
- ret = putreg32(child, addr, data);
- break;
-
- case PTRACE_GETREGS: /* Get all gp regs from the child. */
- return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_GENERAL,
- 0, sizeof(struct user_regs_struct32),
- datap);
-
- case PTRACE_SETREGS: /* Set all gp regs in the child. */
- return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_GENERAL, 0,
- sizeof(struct user_regs_struct32),
- datap);
-
- case PTRACE_GETFPREGS: /* Get the child FPU state. */
- return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_FP, 0,
- sizeof(struct user_i387_ia32_struct),
- datap);
-
- case PTRACE_SETFPREGS: /* Set the child FPU state. */
- return copy_regset_from_user(
- child, &user_x86_32_view, REGSET_FP,
- 0, sizeof(struct user_i387_ia32_struct), datap);
-
- case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */
- return copy_regset_to_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
- sizeof(struct user32_fxsr_struct),
- datap);
-
- case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */
- return copy_regset_from_user(child, &user_x86_32_view,
- REGSET_XFP, 0,
- sizeof(struct user32_fxsr_struct),
- datap);
-
- case PTRACE_GET_THREAD_AREA:
- case PTRACE_SET_THREAD_AREA:
- return arch_ptrace(child, request, addr, data);
-
- default:
- return compat_ptrace_request(child, request, addr, data);
- }
-
- return ret;
+#ifdef CONFIG_IA32_EMULATION
+ return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+ return 0;
+#endif
}
-
-#endif /* CONFIG_IA32_EMULATION */
+#endif /* CONFIG_COMPAT */
#ifdef CONFIG_X86_64
@@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
/* Send us the fake SIGTRAP */
force_sig_info(SIGTRAP, &info, tsk);
}
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
- if (arch == AUDIT_ARCH_X86_64) {
- audit_syscall_entry(regs->orig_ax, regs->di,
- regs->si, regs->dx, regs->r10);
- } else
-#endif
- {
- audit_syscall_entry(regs->orig_ax, regs->bx,
- regs->cx, regs->dx, regs->si);
- }
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2. If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0: resume the syscall
- * 1: go to phase 2; no seccomp phase 2 needed
- * anything else: go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
- unsigned long ret = 0;
- u32 work;
-
- BUG_ON(regs != task_pt_regs(current));
-
- work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
-
- /*
- * If TIF_NOHZ is set, we are required to call user_exit() before
- * doing anything that could touch RCU.
- */
- if (work & _TIF_NOHZ) {
- user_exit();
- work &= ~_TIF_NOHZ;
- }
-
-#ifdef CONFIG_SECCOMP
- /*
- * Do seccomp first -- it should minimize exposure of other
- * code, and keeping seccomp fast is probably more valuable
- * than the rest of this.
- */
- if (work & _TIF_SECCOMP) {
- struct seccomp_data sd;
-
- sd.arch = arch;
- sd.nr = regs->orig_ax;
- sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
- if (arch == AUDIT_ARCH_X86_64) {
- sd.args[0] = regs->di;
- sd.args[1] = regs->si;
- sd.args[2] = regs->dx;
- sd.args[3] = regs->r10;
- sd.args[4] = regs->r8;
- sd.args[5] = regs->r9;
- } else
-#endif
- {
- sd.args[0] = regs->bx;
- sd.args[1] = regs->cx;
- sd.args[2] = regs->dx;
- sd.args[3] = regs->si;
- sd.args[4] = regs->di;
- sd.args[5] = regs->bp;
- }
-
- BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
- BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
- ret = seccomp_phase1(&sd);
- if (ret == SECCOMP_PHASE1_SKIP) {
- regs->orig_ax = -1;
- ret = 0;
- } else if (ret != SECCOMP_PHASE1_OK) {
- return ret; /* Go directly to phase 2 */
- }
-
- work &= ~_TIF_SECCOMP;
- }
-#endif
-
- /* Do our best to finish without phase 2. */
- if (work == 0)
- return ret; /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
- if (work == _TIF_SYSCALL_AUDIT) {
- /*
- * If there is no more work to be done except auditing,
- * then audit in phase 1. Phase 2 always audits, so, if
- * we audit here, then we can't go on to phase 2.
- */
- do_audit_syscall_entry(regs, arch);
- return 0;
- }
-#endif
-
- return 1; /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
- unsigned long phase1_result)
-{
- long ret = 0;
- u32 work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
-
- BUG_ON(regs != task_pt_regs(current));
-
- /*
- * If we stepped into a sysenter/syscall insn, it trapped in
- * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
- * If user-mode had set TF itself, then it's still clear from
- * do_debug() and we need to set it again to restore the user
- * state. If we entered on the slow path, TF was already set.
- */
- if (work & _TIF_SINGLESTEP)
- regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
- /*
- * Call seccomp_phase2 before running the other hooks so that
- * they can see any changes made by a seccomp tracer.
- */
- if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
- /* seccomp failures shouldn't expose any additional code. */
- return -1;
- }
-#endif
-
- if (unlikely(work & _TIF_SYSCALL_EMU))
- ret = -1L;
-
- if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
- tracehook_report_syscall_entry(regs))
- ret = -1L;
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_enter(regs, regs->orig_ax);
-
- do_audit_syscall_entry(regs, arch);
-
- return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
- u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
- unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
- if (phase1_result == 0)
- return regs->orig_ax;
- else
- return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
- bool step;
-
- /*
- * We may come here right after calling schedule_user()
- * or do_notify_resume(), in which case we can be in RCU
- * user mode.
- */
- user_exit();
-
- audit_syscall_exit(regs);
-
- if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
- trace_sys_exit(regs, regs->ax);
-
- /*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter().
- */
- step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
- !test_thread_flag(TIF_SYSCALL_EMU);
- if (step || test_thread_flag(TIF_SYSCALL_TRACE))
- tracehook_report_syscall_exit(regs, step);
-
- user_enter();
-}
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 86db4bcd7ce5..02693dd9a079 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -673,7 +673,7 @@ struct machine_ops machine_ops = {
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
.halt = native_machine_halt,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
.crash_shutdown = native_machine_crash_shutdown,
#endif
};
@@ -703,7 +703,7 @@ void machine_halt(void)
machine_ops.halt();
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
void machine_crash_shutdown(struct pt_regs *regs)
{
machine_ops.crash_shutdown(regs);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 80f874bf999e..fdb7f2a2d328 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void)
return ramdisk_size;
}
-#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
static void __init relocate_initrd(void)
{
/* Assume only end is not page aligned */
u64 ramdisk_image = get_ramdisk_image();
u64 ramdisk_size = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
- unsigned long slop, clen, mapaddr;
- char *p, *q;
/* We need to move the initrd down into directly mapped mem */
relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
@@ -343,25 +340,8 @@ static void __init relocate_initrd(void)
printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
- q = (char *)initrd_start;
-
- /* Copy the initrd */
- while (ramdisk_size) {
- slop = ramdisk_image & ~PAGE_MASK;
- clen = ramdisk_size;
- if (clen > MAX_MAP_CHUNK-slop)
- clen = MAX_MAP_CHUNK-slop;
- mapaddr = ramdisk_image & PAGE_MASK;
- p = early_memremap(mapaddr, clen+slop);
- memcpy(q, p+slop, clen);
- early_memunmap(p, clen+slop);
- q += clen;
- ramdisk_image += clen;
- ramdisk_size -= clen;
- }
+ copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size);
- ramdisk_image = get_ramdisk_image();
- ramdisk_size = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -498,7 +478,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
* --------- Crashkernel reservation ------------------------------
*/
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* Keep the crash kernel below this limit. On 32 bits earlier kernels
@@ -916,11 +896,6 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
- if (boot_params.sys_desc_table.length != 0) {
- machine_id = boot_params.sys_desc_table.table[0];
- machine_submodel_id = boot_params.sys_desc_table.table[1];
- BIOS_revision = boot_params.sys_desc_table.table[2];
- }
#endif
saved_video_mode = boot_params.hdr.vid_mode;
bootloader_type = boot_params.hdr.type_of_loader;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 206996c1669d..da52e6bb5c7f 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,11 +31,11 @@
#include <asm/vdso.h>
#include <asm/mce.h>
#include <asm/sighandling.h>
+#include <asm/vm86.h>
#ifdef CONFIG_X86_64
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
#endif /* CONFIG_X86_64 */
#include <asm/syscall.h>
@@ -93,8 +93,15 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
COPY(r15);
#endif /* CONFIG_X86_64 */
+#ifdef CONFIG_X86_32
COPY_SEG_CPL3(cs);
COPY_SEG_CPL3(ss);
+#else /* !CONFIG_X86_32 */
+ /* Kernel saves and restores only the CS segment register on signals,
+ * which is the bare minimum needed to allow mixed 32/64-bit code.
+ * App's signal handler can save/restore other segments if needed. */
+ COPY_SEG_CPL3(cs);
+#endif /* CONFIG_X86_32 */
get_user_ex(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -154,9 +161,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
#else /* !CONFIG_X86_32 */
put_user_ex(regs->flags, &sc->flags);
put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->__pad2);
- put_user_ex(0, &sc->__pad1);
- put_user_ex(regs->ss, &sc->ss);
+ put_user_ex(0, &sc->gs);
+ put_user_ex(0, &sc->fs);
#endif /* CONFIG_X86_32 */
put_user_ex(fpstate, &sc->fpstate);
@@ -451,19 +457,9 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
regs->sp = (unsigned long)frame;
- /*
- * Set up the CS and SS registers to run signal handlers in
- * 64-bit mode, even if the handler happens to be interrupting
- * 32-bit or 16-bit code.
- *
- * SS is subtle. In 64-bit mode, we don't need any particular
- * SS descriptor, but we do need SS to be valid. It's possible
- * that the old SS is entirely bogus -- this can happen if the
- * signal we're trying to deliver is #GP or #SS caused by a bad
- * SS value.
- */
+ /* Set up the CS register to run signal handlers in 64-bit mode,
+ even if the handler happens to be interrupting 32-bit code. */
regs->cs = __USER_CS;
- regs->ss = __USER_DS;
return 0;
}
@@ -636,6 +632,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
bool stepping, failed;
struct fpu *fpu = &current->thread.fpu;
+ if (v8086_mode(regs))
+ save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -701,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
{
struct ksignal ksig;
@@ -736,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
restore_saved_sigmask();
}
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
- user_exit();
-
- if (thread_info_flags & _TIF_UPROBE)
- uprobe_notify_resume(regs);
-
- /* deal with pending signal delivery */
- if (thread_info_flags & _TIF_SIGPENDING)
- do_signal(regs);
-
- if (thread_info_flags & _TIF_NOTIFY_RESUME) {
- clear_thread_flag(TIF_NOTIFY_RESUME);
- tracehook_notify_resume(regs);
- }
- if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
- fire_user_return_notifiers();
-
- user_enter();
-}
-
void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
{
struct task_struct *me = current;
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 000000000000..dc3c0b1c816f
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+ int err = 0;
+ bool ia32 = test_thread_flag(TIF_IA32);
+
+ if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+ return -EFAULT;
+
+ put_user_try {
+ /* If you change siginfo_t structure, please make sure that
+ this code is fixed accordingly.
+ It should never copy any pad contained in the structure
+ to avoid security leaks, but must copy the generic
+ 3 ints plus the relevant union member. */
+ put_user_ex(from->si_signo, &to->si_signo);
+ put_user_ex(from->si_errno, &to->si_errno);
+ put_user_ex((short)from->si_code, &to->si_code);
+
+ if (from->si_code < 0) {
+ put_user_ex(from->si_pid, &to->si_pid);
+ put_user_ex(from->si_uid, &to->si_uid);
+ put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+ } else {
+ /*
+ * First 32bits of unions are always present:
+ * si_pid === si_band === si_tid === si_addr(LS half)
+ */
+ put_user_ex(from->_sifields._pad[0],
+ &to->_sifields._pad[0]);
+ switch (from->si_code >> 16) {
+ case __SI_FAULT >> 16:
+ break;
+ case __SI_SYS >> 16:
+ put_user_ex(from->si_syscall, &to->si_syscall);
+ put_user_ex(from->si_arch, &to->si_arch);
+ break;
+ case __SI_CHLD >> 16:
+ if (ia32) {
+ put_user_ex(from->si_utime, &to->si_utime);
+ put_user_ex(from->si_stime, &to->si_stime);
+ } else {
+ put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+ put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+ }
+ put_user_ex(from->si_status, &to->si_status);
+ /* FALL THROUGH */
+ default:
+ case __SI_KILL >> 16:
+ put_user_ex(from->si_uid, &to->si_uid);
+ break;
+ case __SI_POLL >> 16:
+ put_user_ex(from->si_fd, &to->si_fd);
+ break;
+ case __SI_TIMER >> 16:
+ put_user_ex(from->si_overrun, &to->si_overrun);
+ put_user_ex(ptr_to_compat(from->si_ptr),
+ &to->si_ptr);
+ break;
+ /* This is not generated by the kernel as of now. */
+ case __SI_RT >> 16:
+ case __SI_MESGQ >> 16:
+ put_user_ex(from->si_uid, &to->si_uid);
+ put_user_ex(from->si_int, &to->si_int);
+ break;
+ }
+ }
+ } put_user_catch(err);
+
+ return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+ int err = 0;
+ u32 ptr32;
+
+ if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+ return -EFAULT;
+
+ get_user_try {
+ get_user_ex(to->si_signo, &from->si_signo);
+ get_user_ex(to->si_errno, &from->si_errno);
+ get_user_ex(to->si_code, &from->si_code);
+
+ get_user_ex(to->si_pid, &from->si_pid);
+ get_user_ex(to->si_uid, &from->si_uid);
+ get_user_ex(ptr32, &from->si_ptr);
+ to->si_ptr = compat_ptr(ptr32);
+ } get_user_catch(err);
+
+ return err;
+}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 15aaa69bbb5e..12c8286206ce 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -30,6 +30,7 @@
#include <asm/proto.h>
#include <asm/apic.h>
#include <asm/nmi.h>
+#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -243,6 +244,7 @@ static void native_stop_other_cpus(int wait)
finish:
local_irq_save(flags);
disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8add66b22f33..e0c198e5f920 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
-atomic_t init_deasserted;
-
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -146,16 +144,11 @@ static void smp_callin(void)
/*
* If waken up by an INIT in an 82489DX configuration
- * we may get here before an INIT-deassert IPI reaches
- * our local APIC. We have to wait for the IPI or we'll
- * lock up on an APIC access.
- *
- * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
+ * cpu_callout_mask guarantees we don't get here before
+ * an INIT_deassert IPI reaches our local APIC, so it is
+ * now safe to touch our local APIC.
*/
cpuid = smp_processor_id();
- if (apic->wait_for_init_deassert && cpuid)
- while (!atomic_read(&init_deasserted))
- cpu_relax();
/*
* (This works even if the APIC is not enabled.)
@@ -171,11 +164,6 @@ static void smp_callin(void)
apic_ap_setup();
/*
- * Need to setup vector mappings before we enable interrupts.
- */
- setup_vector_irq(smp_processor_id());
-
- /*
* Save our processor parameters. Note: this information
* is needed for clock calibration.
*/
@@ -239,18 +227,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
- * Enable the espfix hack for this CPU
- */
-#ifdef CONFIG_X86_ESPFIX64
- init_espfix_ap();
-#endif
-
- /*
- * We need to hold vector_lock so there the set of online cpus
- * does not change while we are assigning vectors to cpus. Holding
- * this lock ensures we don't half assign or remove an irq from a cpu.
+ * Lock vector_lock and initialize the vectors on this cpu
+ * before setting the cpu online. We must set it online with
+ * vector_lock held to prevent a concurrent setup/teardown
+ * from seeing a half valid vector space.
*/
lock_vector_lock();
+ setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
cpu_set_state_online(smp_processor_id());
@@ -630,7 +613,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
send_status = safe_apic_wait_icr_idle();
mb();
- atomic_set(&init_deasserted, 1);
/*
* Should we send STARTUP IPIs ?
@@ -675,7 +657,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(300);
+ if (init_udelay)
+ udelay(300);
pr_debug("Startup point 1\n");
@@ -685,7 +668,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/*
* Give the other CPU some time to accept the IPI.
*/
- udelay(200);
+ if (init_udelay)
+ udelay(200);
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
@@ -854,6 +838,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
+ /*
+ * Enable the espfix hack for this CPU
+ */
+#ifdef CONFIG_X86_ESPFIX64
+ init_espfix_ap(cpu);
+#endif
+
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -862,8 +853,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
* the targeted processor.
*/
- atomic_set(&init_deasserted, 0);
-
if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
pr_debug("Setting warm reset code and vector.\n");
@@ -901,7 +890,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
if (!boot_error) {
/*
- * Wait 10s total for a response from AP
+ * Wait 10s total for first sign of life from AP
*/
boot_error = -1;
timeout = jiffies + 10*HZ;
@@ -914,7 +903,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
boot_error = 0;
break;
}
- udelay(100);
schedule();
}
}
@@ -930,7 +918,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
* for the MTRR work(triggered by the AP coming online)
* to be completed in the stop machine context.
*/
- udelay(100);
schedule();
}
}
@@ -995,8 +982,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
+ /*
+ * We have to walk the irq descriptors to setup the vector
+ * space for the cpu which comes online. Prevent irq
+ * alloc/free across the bringup.
+ */
+ irq_lock_sparse();
+
err = do_boot_cpu(apicid, cpu, tidle);
+
if (err) {
+ irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1014,6 +1010,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
+ irq_unlock_sparse();
+
return 0;
}
@@ -1350,7 +1348,7 @@ static void remove_siblinginfo(int cpu)
cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
}
-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
{
set_cpu_online(cpu, false);
cpumask_clear_cpu(cpu, cpu_callout_mask);
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 9b4d51d0c0d0..c9a073866ca7 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/ptrace.h>
#include <asm/desc.h>
+#include <asm/mmu_context.h>
unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
{
@@ -17,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
return addr;
}
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* We'll assume that the code segments in the GDT
* are all zero-based. That is largely true: the
@@ -27,13 +29,14 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
struct desc_struct *desc;
unsigned long base;
- seg &= ~7UL;
+ seg >>= 3;
mutex_lock(&child->mm->context.lock);
- if (unlikely((seg >> 3) >= child->mm->context.size))
+ if (unlikely(!child->mm->context.ldt ||
+ seg >= child->mm->context.ldt->size))
addr = -1L; /* bogus selector, access would fault */
else {
- desc = child->mm->context.ldt + seg;
+ desc = &child->mm->context.ldt->entries[seg];
base = get_desc_base(desc);
/* 16-bit code segment? */
@@ -43,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
}
mutex_unlock(&child->mm->context.lock);
}
+#endif
return addr;
}
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 649b010da00b..12cbe2b88c0f 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug);
*
* This is only called for debugging CPU offline/online feature.
*/
-int __ref _debug_hotplug_cpu(int cpu, int action)
+int _debug_hotplug_cpu(int cpu, int action)
{
struct device *dev = get_cpu_device(cpu);
int ret;
@@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void)
late_initcall_sync(debug_hotplug_cpu);
#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
-int __ref arch_register_cpu(int num)
+int arch_register_cpu(int num)
{
struct cpuinfo_x86 *c = &cpu_data(num);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 25b993729f9b..80bb24d9b880 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
*/
u64 notrace trace_clock_x86_tsc(void)
{
- u64 ret;
-
- rdtsc_barrier();
- rdtscll(ret);
-
- return ret;
+ return rdtsc_ordered();
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f5791927aa64..346eec73f7db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
#include <asm/fpu/xstate.h>
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
+#include <asm/vm86.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
{
- enum ctx_state prev_state;
-
if (user_mode(regs)) {
- /* Other than that, we're just an exception. */
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
} else {
/*
* We might have interrupted pretty much anything. In
@@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
- prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
}
/*
- * We are atomic because we're on the IST stack (or we're on x86_32,
- * in which case we still shouldn't schedule).
- *
- * This must be after exception_enter(), because exception_enter()
- * won't do anything if in_interrupt() returns true.
+ * We are atomic because we're on the IST stack; or we're on
+ * x86_32, in which case we still shouldn't schedule; or we're
+ * on x86_64 and entered from user mode, in which case we're
+ * still atomic unless ist_begin_non_atomic is called.
*/
preempt_count_add(HARDIRQ_OFFSET);
/* This code is a bit fragile. Test it. */
- rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
-
- return prev_state;
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
}
-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
{
- /* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
- if (user_mode(regs))
- return exception_exit(prev_state);
- else
+ if (!user_mode(regs))
rcu_nmi_exit();
}
@@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
* before ist_exit().
*/
void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
unsigned long trapnr, int signr)
{
- enum ctx_state prev_state = exception_enter();
siginfo_t info;
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
NOTIFY_STOP) {
conditional_sti(regs);
do_trap(trapnr, signr, str, regs, error_code,
fill_trap_info(regs, signr, trapnr, &info));
}
-
- exception_exit(prev_state);
}
#define DO_ERROR(trapnr, signr, str, name) \
@@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
- ist_enter(regs); /* Discard prev_state because we won't return. */
+ ist_enter(regs);
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
const struct bndcsr *bndcsr;
siginfo_t *info;
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
- goto exit;
+ return;
conditional_sti(regs);
if (!user_mode(regs))
@@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
die("bounds", regs, error_code);
}
-exit:
- exception_exit(prev_state);
return;
+
exit_trap:
/*
* This path out is for all the cases where we could not
@@ -447,35 +435,33 @@ exit_trap:
* time..
*/
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
- exception_exit(prev_state);
}
dotraplinkage void
do_general_protection(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk;
- enum ctx_state prev_state;
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
conditional_sti(regs);
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
- goto exit;
+ return;
}
tsk = current;
if (!user_mode(regs)) {
if (fixup_exception(regs))
- goto exit;
+ return;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
die("general protection fault", regs, error_code);
- goto exit;
+ return;
}
tsk->thread.error_code = error_code;
@@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
- exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_general_protection);
/* May run on IST stack. */
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
-
#ifdef CONFIG_DYNAMIC_FTRACE
/*
* ftrace must be first, everything else may cause a recursive crash.
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
- prev_state = ist_enter(regs);
+ ist_enter(regs);
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
NOKPROBE_SYMBOL(do_int3);
@@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
- enum ctx_state prev_state;
int user_icebp = 0;
unsigned long dr6;
int si_code;
- prev_state = ist_enter(regs);
+ ist_enter(regs);
get_debugreg(dr6, 6);
@@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
- ist_exit(regs, prev_state);
+ ist_exit(regs);
}
NOKPROBE_SYMBOL(do_debug);
@@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_MF);
- exception_exit(prev_state);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_XF);
- exception_exit(prev_state);
}
dotraplinkage void
@@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
- enum ctx_state prev_state;
-
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
BUG_ON(use_eager_fpu());
#ifdef CONFIG_MATH_EMULATION
@@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
- exception_exit(prev_state);
return;
}
#endif
@@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#ifdef CONFIG_X86_32
conditional_sti(regs);
#endif
- exception_exit(prev_state);
}
NOKPROBE_SYMBOL(do_device_not_available);
@@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
- enum ctx_state prev_state;
- prev_state = exception_enter();
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
info.si_signo = SIGILL;
@@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
&info);
}
- exception_exit(prev_state);
}
#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 505449700e0c..c3f7602cd038 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -21,6 +21,7 @@
#include <asm/hypervisor.h>
#include <asm/nmi.h>
#include <asm/x86_init.h>
+#include <asm/geode.h>
unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
EXPORT_SYMBOL(cpu_khz);
@@ -38,7 +39,7 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
-static struct static_key __use_tsc = STATIC_KEY_INIT;
+static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
@@ -248,7 +249,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
data = cyc2ns_write_begin(cpu);
- rdtscll(tsc_now);
+ tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -274,7 +275,12 @@ done:
*/
u64 native_sched_clock(void)
{
- u64 tsc_now;
+ if (static_branch_likely(&__use_tsc)) {
+ u64 tsc_now = rdtsc();
+
+ /* return the value in ns */
+ return cycles_2_ns(tsc_now);
+ }
/*
* Fall back to jiffies if there's no TSC available:
@@ -284,16 +290,17 @@ u64 native_sched_clock(void)
* very important for it to be as fast as the platform
* can achieve it. )
*/
- if (!static_key_false(&__use_tsc)) {
- /* No locking but a rare wrong value is not a big deal: */
- return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
- }
- /* read the Time Stamp Counter: */
- rdtscll(tsc_now);
+ /* No locking but a rare wrong value is not a big deal: */
+ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+}
- /* return the value in ns */
- return cycles_2_ns(tsc_now);
+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+ return cycles_2_ns(tsc);
}
/* We need to define a real function for sched_clock, to override the
@@ -308,12 +315,6 @@ unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));
#endif
-unsigned long long native_read_tsc(void)
-{
- return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
int check_tsc_unstable(void)
{
return tsc_unstable;
@@ -598,10 +599,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
+ delta -= tsc;
+
+ /*
+ * Extrapolate the error and fail fast if the error will
+ * never be below 500 ppm.
+ */
+ if (i == 1 &&
+ d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
+ return 0;
+
/*
* Iterate until the error is less than 500 ppm
*/
- delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;
@@ -967,7 +977,7 @@ static struct clocksource clocksource_tsc;
*/
static cycle_t read_tsc(struct clocksource *cs)
{
- return (cycle_t)get_cycles();
+ return (cycle_t)rdtsc_ordered();
}
/*
@@ -1004,15 +1014,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable);
static void __init check_system_tsc_reliable(void)
{
-#ifdef CONFIG_MGEODE_LX
- /* RTSC counts during suspend */
+#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
+ if (is_geode_lx()) {
+ /* RTSC counts during suspend */
#define RTSC_SUSP 0x100
- unsigned long res_low, res_high;
+ unsigned long res_low, res_high;
- rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
- /* Geode_LX - the OLPC CPU has a very reliable TSC */
- if (res_low & RTSC_SUSP)
- tsc_clocksource_reliable = 1;
+ rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
+ /* Geode_LX - the OLPC CPU has a very reliable TSC */
+ if (res_low & RTSC_SUSP)
+ tsc_clocksource_reliable = 1;
+ }
#endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
@@ -1201,7 +1213,7 @@ void __init tsc_init(void)
/* now allow native_sched_clock() to use rdtsc */
tsc_disabled = 0;
- static_key_slow_inc(&__use_tsc);
+ static_branch_enable(&__use_tsc);
if (!no_sched_irq_time)
enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index dd8d0791dfb5..78083bf23ed1 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
static int nr_warps;
/*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs. This is not called
+ * if there is no TSC.
*/
static void check_tsc_warp(unsigned int timeout)
{
cycles_t start, now, prev, end;
int i;
- rdtsc_barrier();
- start = get_cycles();
- rdtsc_barrier();
+ start = rdtsc_ordered();
/*
* The measurement runs for 'timeout' msecs:
*/
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
*/
arch_spin_lock(&sync_lock);
prev = last_tsc;
- rdtsc_barrier();
- now = get_cycles();
- rdtsc_barrier();
+ now = rdtsc_ordered();
last_tsc = now;
arch_spin_unlock(&sync_lock);
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
/*
* No need to check if we already know that the TSC is not
- * synchronized:
+ * synchronized or if we have no TSC.
*/
if (unsynchronized_tsc())
return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
{
int cpus = 2;
+ /* Also aborts if there is no TSC. */
if (unsynchronized_tsc() || tsc_clocksource_reliable)
return;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 66476244731e..bf4db6eaec8f 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -985,3 +985,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return -1;
}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+ return regs->sp < ret->stack;
+ else
+ return regs->sp <= ret->stack;
+}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..524619351961 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,11 +44,15 @@
#include <linux/ptrace.h>
#include <linux/audit.h>
#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/security.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/tlbflush.h>
#include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>
/*
* Known problems:
@@ -66,10 +70,6 @@
*/
-#define KVM86 ((struct kernel_vm86_struct *)regs)
-#define VMPI KVM86->vm86plus
-
-
/*
* 8- and 16-bit register defines..
*/
@@ -81,8 +81,8 @@
/*
* virtual flags (16 and 32-bit versions)
*/
-#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS (current->thread.v86flags)
+#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS (current->thread.vm86->veflags)
#define set_flags(X, new, mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -90,46 +90,13 @@
#define SAFE_MASK (0xDD5)
#define RETURN_MASK (0xDFF)
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
- const struct kernel_vm86_regs *regs)
-{
- int ret = 0;
-
- /*
- * kernel_vm86_regs is missing gs, so copy everything up to
- * (but not including) orig_eax, and then rest including orig_eax.
- */
- ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
- return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
- const struct vm86_regs __user *user,
- unsigned extra)
-{
- int ret = 0;
-
- /* copy ax-fs inclusive */
- ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
- /* copy orig_ax-__gsh+extra */
- ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
- sizeof(struct kernel_vm86_regs) -
- offsetof(struct kernel_vm86_regs, pt.orig_ax) +
- extra);
- return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
struct tss_struct *tss;
- struct pt_regs *ret;
- unsigned long tmp;
+ struct task_struct *tsk = current;
+ struct vm86plus_struct __user *user;
+ struct vm86 *vm86 = current->thread.vm86;
+ long err = 0;
/*
* This gets called from entry.S with interrupts disabled, but
@@ -138,31 +105,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
*/
local_irq_enable();
- if (!current->thread.vm86_info) {
- pr_alert("no vm86_info: BAD\n");
+ if (!vm86 || !vm86->user_vm86) {
+ pr_alert("no user_vm86: BAD\n");
+ do_exit(SIGSEGV);
+ }
+ set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+ user = vm86->user_vm86;
+
+ if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+ sizeof(struct vm86plus_struct) :
+ sizeof(struct vm86_struct))) {
+ pr_alert("could not access userspace vm86 info\n");
do_exit(SIGSEGV);
}
- set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
- tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
- tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
- if (tmp) {
- pr_alert("could not access userspace vm86_info\n");
+
+ put_user_try {
+ put_user_ex(regs->pt.bx, &user->regs.ebx);
+ put_user_ex(regs->pt.cx, &user->regs.ecx);
+ put_user_ex(regs->pt.dx, &user->regs.edx);
+ put_user_ex(regs->pt.si, &user->regs.esi);
+ put_user_ex(regs->pt.di, &user->regs.edi);
+ put_user_ex(regs->pt.bp, &user->regs.ebp);
+ put_user_ex(regs->pt.ax, &user->regs.eax);
+ put_user_ex(regs->pt.ip, &user->regs.eip);
+ put_user_ex(regs->pt.cs, &user->regs.cs);
+ put_user_ex(regs->pt.flags, &user->regs.eflags);
+ put_user_ex(regs->pt.sp, &user->regs.esp);
+ put_user_ex(regs->pt.ss, &user->regs.ss);
+ put_user_ex(regs->es, &user->regs.es);
+ put_user_ex(regs->ds, &user->regs.ds);
+ put_user_ex(regs->fs, &user->regs.fs);
+ put_user_ex(regs->gs, &user->regs.gs);
+
+ put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+ } put_user_catch(err);
+ if (err) {
+ pr_alert("could not access userspace vm86 info\n");
do_exit(SIGSEGV);
}
tss = &per_cpu(cpu_tss, get_cpu());
- current->thread.sp0 = current->thread.saved_sp0;
- current->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &current->thread);
- current->thread.saved_sp0 = 0;
+ tsk->thread.sp0 = vm86->saved_sp0;
+ tsk->thread.sysenter_cs = __KERNEL_CS;
+ load_sp0(tss, &tsk->thread);
+ vm86->saved_sp0 = 0;
put_cpu();
- ret = KVM86->regs32;
+ memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
- ret->fs = current->thread.saved_fs;
- set_user_gs(ret, current->thread.saved_gs);
+ lazy_load_gs(vm86->regs32.gs);
- return ret;
+ regs->pt.ax = retval;
}
static void mark_screen_rdonly(struct mm_struct *mm)
@@ -200,45 +193,16 @@ out:
static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
{
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk = current;
- int tmp;
-
- if (tsk->thread.saved_sp0)
- return -EPERM;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, vm86plus) -
- sizeof(info.regs));
- if (tmp)
- return -EFAULT;
- memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
- info.regs32 = current_pt_regs();
- tsk->thread.vm86_info = v86;
- do_sys_vm86(&info, tsk);
- return 0; /* we never return here */
+ return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
}
SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
{
- struct kernel_vm86_struct info; /* declare this _on top_,
- * this avoids wasting of stack space.
- * This remains on the stack until we
- * return to 32 bit user space.
- */
- struct task_struct *tsk;
- int tmp;
- struct vm86plus_struct __user *v86;
-
- tsk = current;
switch (cmd) {
case VM86_REQUEST_IRQ:
case VM86_FREE_IRQ:
@@ -256,114 +220,159 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
- if (tsk->thread.saved_sp0)
- return -EPERM;
- v86 = (struct vm86plus_struct __user *)arg;
- tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
- offsetof(struct kernel_vm86_struct, regs32) -
- sizeof(info.regs));
- if (tmp)
- return -EFAULT;
- info.regs32 = current_pt_regs();
- info.vm86plus.is_vm86pus = 1;
- tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
- do_sys_vm86(&info, tsk);
- return 0; /* we never return here */
+ return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
}
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
- info->regs.pt.ds = 0;
- info->regs.pt.es = 0;
- info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
- info->regs.pt.gs = 0;
-#endif
+ struct task_struct *tsk = current;
+ struct vm86 *vm86 = tsk->thread.vm86;
+ struct kernel_vm86_regs vm86regs;
+ struct pt_regs *regs = current_pt_regs();
+ unsigned long err = 0;
+
+ err = security_mmap_addr(0);
+ if (err) {
+ /*
+ * vm86 cannot virtualize the address space, so vm86 users
+ * need to manage the low 1MB themselves using mmap. Given
+ * that BIOS places important data in the first page, vm86
+ * is essentially useless if mmap_min_addr != 0. DOSEMU,
+ * for example, won't even bother trying to use vm86 if it
+ * can't map a page at virtual address 0.
+ *
+ * To reduce the available kernel attack surface, simply
+ * disallow vm86(old) for users who cannot mmap at va 0.
+ *
+ * The implementation of security_mmap_addr will allow
+ * suitably privileged users to map va 0 even if
+ * vm.mmap_min_addr is set above 0, and we want this
+ * behavior for vm86 as well, as it ensures that legacy
+ * tools like vbetool will not fail just because of
+ * vm.mmap_min_addr.
+ */
+ pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n",
+ current->comm, task_pid_nr(current),
+ from_kuid_munged(&init_user_ns, current_uid()));
+ return -EPERM;
+ }
+
+ if (!vm86) {
+ if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+ return -ENOMEM;
+ tsk->thread.vm86 = vm86;
+ }
+ if (vm86->saved_sp0)
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, user_vm86, plus ?
+ sizeof(struct vm86_struct) :
+ sizeof(struct vm86plus_struct)))
+ return -EFAULT;
+
+ memset(&vm86regs, 0, sizeof(vm86regs));
+ get_user_try {
+ unsigned short seg;
+ get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+ get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+ get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+ get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+ get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+ get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+ get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+ get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+ get_user_ex(seg, &user_vm86->regs.cs);
+ vm86regs.pt.cs = seg;
+ get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+ get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+ get_user_ex(seg, &user_vm86->regs.ss);
+ vm86regs.pt.ss = seg;
+ get_user_ex(vm86regs.es, &user_vm86->regs.es);
+ get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+ get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+ get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+ get_user_ex(vm86->flags, &user_vm86->flags);
+ get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+ get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+ } get_user_catch(err);
+ if (err)
+ return err;
+
+ if (copy_from_user(&vm86->int_revectored,
+ &user_vm86->int_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (copy_from_user(&vm86->int21_revectored,
+ &user_vm86->int21_revectored,
+ sizeof(struct revectored_struct)))
+ return -EFAULT;
+ if (plus) {
+ if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+ sizeof(struct vm86plus_info_struct)))
+ return -EFAULT;
+ vm86->vm86plus.is_vm86pus = 1;
+ } else
+ memset(&vm86->vm86plus, 0,
+ sizeof(struct vm86plus_info_struct));
+
+ memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+ vm86->user_vm86 = user_vm86;
/*
* The flags register is also special: we cannot trust that the user
* has set it up safely, so this makes sure interrupt etc flags are
* inherited from protected mode.
*/
- VEFLAGS = info->regs.pt.flags;
- info->regs.pt.flags &= SAFE_MASK;
- info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
- info->regs.pt.flags |= X86_VM_MASK;
+ VEFLAGS = vm86regs.pt.flags;
+ vm86regs.pt.flags &= SAFE_MASK;
+ vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+ vm86regs.pt.flags |= X86_VM_MASK;
+
+ vm86regs.pt.orig_ax = regs->orig_ax;
- switch (info->cpu_type) {
+ switch (vm86->cpu_type) {
case CPU_286:
- tsk->thread.v86mask = 0;
+ vm86->veflags_mask = 0;
break;
case CPU_386:
- tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
case CPU_486:
- tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
default:
- tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
break;
}
/*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
*/
- info->regs32->ax = VM86_SIGNAL;
- tsk->thread.saved_sp0 = tsk->thread.sp0;
- tsk->thread.saved_fs = info->regs32->fs;
- tsk->thread.saved_gs = get_user_gs(info->regs32);
+ vm86->saved_sp0 = tsk->thread.sp0;
+ lazy_save_gs(vm86->regs32.gs);
tss = &per_cpu(cpu_tss, get_cpu());
- tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+ /* make room for real-mode segments */
+ tsk->thread.sp0 += 16;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
load_sp0(tss, &tsk->thread);
put_cpu();
- tsk->thread.screen_bitmap = info->screen_bitmap;
- if (info->flags & VM86_SCREEN_BITMAP)
+ if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
- /*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
- if (unlikely(current->audit_context))
- __audit_syscall_exit(1, 0);
-#endif
-
- __asm__ __volatile__(
- "movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
- "mov %2, %%gs\n\t"
-#endif
- "jmp resume_userspace"
- : /* no outputs */
- :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
- /* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
- struct pt_regs *regs32;
-
- regs32 = save_v86_state(regs16);
- regs32->ax = retval;
- __asm__ __volatile__("movl %0,%%esp\n\t"
- "movl %1,%%ebp\n\t"
- "jmp resume_userspace"
- : : "r" (regs32), "r" (current_thread_info()));
+ memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+ force_iret();
+ return regs->ax;
}
static inline void set_IF(struct kernel_vm86_regs *regs)
{
VEFLAGS |= X86_EFLAGS_VIF;
- if (VEFLAGS & X86_EFLAGS_VIP)
- return_to_32bit(regs, VM86_STI);
}
static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -395,7 +404,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
{
- set_flags(VEFLAGS, flags, current->thread.v86mask);
+ set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -405,7 +414,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
{
- set_flags(VFLAGS, flags, current->thread.v86mask);
+ set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
if (flags & X86_EFLAGS_IF)
set_IF(regs);
@@ -420,7 +429,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
if (VEFLAGS & X86_EFLAGS_VIF)
flags |= X86_EFLAGS_IF;
flags |= X86_EFLAGS_IOPL;
- return flags | (VEFLAGS & current->thread.v86mask);
+ return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
}
static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@@ -518,12 +527,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
{
unsigned long __user *intr_ptr;
unsigned long segoffs;
+ struct vm86 *vm86 = current->thread.vm86;
if (regs->pt.cs == BIOSSEG)
goto cannot_handle;
- if (is_revectored(i, &KVM86->int_revectored))
+ if (is_revectored(i, &vm86->int_revectored))
goto cannot_handle;
- if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+ if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
goto cannot_handle;
intr_ptr = (unsigned long __user *) (i << 2);
if (get_user(segoffs, intr_ptr))
@@ -542,18 +552,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
return;
cannot_handle:
- return_to_32bit(regs, VM86_INTx + (i << 8));
+ save_v86_state(regs, VM86_INTx + (i << 8));
}
int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
- if (VMPI.is_vm86pus) {
+ struct vm86 *vm86 = current->thread.vm86;
+
+ if (vm86->vm86plus.is_vm86pus) {
if ((trapno == 3) || (trapno == 1)) {
- KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
- /* setting this flag forces the code in entry_32.S to
- the path where we call save_v86_state() and change
- the stack pointer to KVM86->regs32 */
- set_thread_flag(TIF_NOTIFY_RESUME);
+ save_v86_state(regs, VM86_TRAP + (trapno << 8));
return 0;
}
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -574,16 +582,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
unsigned char __user *ssp;
unsigned short ip, sp, orig_flags;
int data32, pref_done;
+ struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
#define CHECK_IF_IN_TRAP \
- if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+ if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
- if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
- return_to_32bit(regs, VM86_PICRETURN); \
- if (orig_flags & X86_EFLAGS_TF) \
- handle_vm86_trap(regs, 0, 1); \
- return; } while (0)
orig_flags = *(unsigned short *)&regs->pt.flags;
@@ -622,7 +625,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
SP(regs) -= 2;
}
IP(regs) = ip;
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* popf */
case 0x9d:
@@ -642,16 +645,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
else
set_vflags_short(newflags, regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* int xx */
case 0xcd: {
int intno = popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
- if (VMPI.vm86dbg_active) {
- if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
- return_to_32bit(regs, VM86_INTx + (intno << 8));
+ if (vmpi->vm86dbg_active) {
+ if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+ save_v86_state(regs, VM86_INTx + (intno << 8));
+ return;
+ }
}
do_int(regs, intno, ssp, sp);
return;
@@ -682,14 +687,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
} else {
set_vflags_short(newflags, regs);
}
- VM86_FAULT_RETURN;
+ goto check_vip;
}
/* cli */
case 0xfa:
IP(regs) = ip;
clear_IF(regs);
- VM86_FAULT_RETURN;
+ goto vm86_fault_return;
/* sti */
/*
@@ -701,14 +706,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
case 0xfb:
IP(regs) = ip;
set_IF(regs);
- VM86_FAULT_RETURN;
+ goto check_vip;
default:
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
return;
+check_vip:
+ if (VEFLAGS & X86_EFLAGS_VIP) {
+ save_v86_state(regs, VM86_STI);
+ return;
+ }
+
+vm86_fault_return:
+ if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+ save_v86_state(regs, VM86_PICRETURN);
+ return;
+ }
+ if (orig_flags & X86_EFLAGS_TF)
+ handle_vm86_trap(regs, 0, X86_TRAP_DB);
+ return;
+
simulate_sigsegv:
/* FIXME: After a long discussion with Stas we finally
* agreed, that this is wrong. Here we should
@@ -720,7 +740,7 @@ simulate_sigsegv:
* should be a mixture of the two, but how do we
* get the information? [KD]
*/
- return_to_32bit(regs, VM86_UNKNOWN);
+ save_v86_state(regs, VM86_UNKNOWN);
}
/* ---------------- vm86 special IRQ passing stuff ----------------- */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 00bf300fd846..74e4bf11f562 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -364,7 +364,7 @@ INIT_PER_CPU(irq_stack_union);
#endif /* CONFIG_X86_32 */
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 67d215cb8953..a1ff508bb423 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
+ hyperv.o
+
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
kvm-intel-y += vmx.o pmu_intel.o
kvm-amd-y += svm.o pmu_amd.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 64dd46793099..2fbea2544f24 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -98,6 +98,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
+ if (vcpu->arch.eager_fpu)
+ kvm_x86_ops->fpu_activate(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e7a4fde5d631..b372a7557c16 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
u16 sel;
la = seg_base(ctxt, addr.seg) + addr.ea;
+ *linear = la;
*max_size = 0;
switch (mode) {
case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
return emulate_gp(ctxt, 0);
- *linear = la;
return X86EMUL_CONTINUE;
bad:
if (addr.seg == VCPU_SREG_SS)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
new file mode 100644
index 000000000000..a8160d2ae362
--- /dev/null
+++ b/arch/x86/kvm/hyperv.c
@@ -0,0 +1,377 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "x86.h"
+#include "lapic.h"
+#include "hyperv.h"
+
+#include <linux/kvm_host.h>
+#include <trace/events/kvm.h>
+
+#include "trace.h"
+
+static bool kvm_hv_msr_partition_wide(u32 msr)
+{
+ bool r = false;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ case HV_X64_MSR_HYPERCALL:
+ case HV_X64_MSR_REFERENCE_TSC:
+ case HV_X64_MSR_TIME_REF_COUNT:
+ case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ r = true;
+ break;
+ }
+
+ return r;
+}
+
+static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ return -EINVAL;
+
+ *pdata = hv->hv_crash_param[index];
+ return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ *pdata = hv->hv_crash_ctl;
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ if (host)
+ hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY;
+
+ if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) {
+
+ vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+ hv->hv_crash_param[0],
+ hv->hv_crash_param[1],
+ hv->hv_crash_param[2],
+ hv->hv_crash_param[3],
+ hv->hv_crash_param[4]);
+
+ /* Send notification about crash to user space */
+ kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+ }
+
+ return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
+ u32 index, u64 data)
+{
+ struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+ if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ return -EINVAL;
+
+ hv->hv_crash_param[index] = data;
+ return 0;
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+ bool host)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ hv->hv_guest_os_id = data;
+ /* setting guest os id to zero disables hypercall page */
+ if (!hv->hv_guest_os_id)
+ hv->hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
+ break;
+ case HV_X64_MSR_HYPERCALL: {
+ u64 gfn;
+ unsigned long addr;
+ u8 instructions[4];
+
+ /* if guest os id is not set hypercall should remain disabled */
+ if (!hv->hv_guest_os_id)
+ break;
+ if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
+ hv->hv_hypercall = data;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ kvm_x86_ops->patch_hypercall(vcpu, instructions);
+ ((unsigned char *)instructions)[3] = 0xc3; /* ret */
+ if (__copy_to_user((void __user *)addr, instructions, 4))
+ return 1;
+ hv->hv_hypercall = data;
+ mark_page_dirty(kvm, gfn);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC: {
+ u64 gfn;
+ HV_REFERENCE_TSC_PAGE tsc_ref;
+
+ memset(&tsc_ref, 0, sizeof(tsc_ref));
+ hv->hv_tsc_page = data;
+ if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ break;
+ gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+ if (kvm_write_guest(
+ kvm,
+ gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
+ &tsc_ref, sizeof(tsc_ref)))
+ return 1;
+ mark_page_dirty(kvm, gfn);
+ break;
+ }
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_set_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ data);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+ return 0;
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_APIC_ASSIST_PAGE: {
+ u64 gfn;
+ unsigned long addr;
+
+ if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
+ hv->hv_vapic = data;
+ if (kvm_lapic_enable_pv_eoi(vcpu, 0))
+ return 1;
+ break;
+ }
+ gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
+ if (kvm_is_error_hva(addr))
+ return 1;
+ if (__clear_user((void __user *)addr, PAGE_SIZE))
+ return 1;
+ hv->hv_vapic = data;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ if (kvm_lapic_enable_pv_eoi(vcpu,
+ gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
+ return 1;
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
+ return 1;
+ }
+
+ return 0;
+}
+
+static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_hv *hv = &kvm->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_GUEST_OS_ID:
+ data = hv->hv_guest_os_id;
+ break;
+ case HV_X64_MSR_HYPERCALL:
+ data = hv->hv_hypercall;
+ break;
+ case HV_X64_MSR_TIME_REF_COUNT: {
+ data =
+ div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ break;
+ }
+ case HV_X64_MSR_REFERENCE_TSC:
+ data = hv->hv_tsc_page;
+ break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ return kvm_hv_msr_get_crash_data(vcpu,
+ msr - HV_X64_MSR_CRASH_P0,
+ pdata);
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+
+ *pdata = data;
+ return 0;
+}
+
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ u64 data = 0;
+ struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
+
+ switch (msr) {
+ case HV_X64_MSR_VP_INDEX: {
+ int r;
+ struct kvm_vcpu *v;
+
+ kvm_for_each_vcpu(r, v, vcpu->kvm) {
+ if (v == vcpu) {
+ data = r;
+ break;
+ }
+ }
+ break;
+ }
+ case HV_X64_MSR_EOI:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
+ case HV_X64_MSR_ICR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
+ case HV_X64_MSR_TPR:
+ return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_APIC_ASSIST_PAGE:
+ data = hv->hv_vapic;
+ break;
+ default:
+ vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ return 1;
+ }
+ *pdata = data;
+ return 0;
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->lock);
+ r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return kvm_hv_set_msr(vcpu, msr, data);
+}
+
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ if (kvm_hv_msr_partition_wide(msr)) {
+ int r;
+
+ mutex_lock(&vcpu->kvm->lock);
+ r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
+ mutex_unlock(&vcpu->kvm->lock);
+ return r;
+ } else
+ return kvm_hv_get_msr(vcpu, msr, pdata);
+}
+
+bool kvm_hv_hypercall_enabled(struct kvm *kvm)
+{
+ return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+}
+
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
+{
+ u64 param, ingpa, outgpa, ret;
+ uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
+ bool fast, longmode;
+
+ /*
+ * hypercall generates UD from non zero cpl and real mode
+ * per HYPER-V spec
+ */
+ if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 0;
+ }
+
+ longmode = is_64_bit_mode(vcpu);
+
+ if (!longmode) {
+ param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
+ ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
+ outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
+ (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
+ }
+#ifdef CONFIG_X86_64
+ else {
+ param = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
+ outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
+ }
+#endif
+
+ code = param & 0xffff;
+ fast = (param >> 16) & 0x1;
+ rep_cnt = (param >> 32) & 0xfff;
+ rep_idx = (param >> 48) & 0xfff;
+
+ trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
+
+ switch (code) {
+ case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+ kvm_vcpu_on_spin(vcpu);
+ break;
+ default:
+ res = HV_STATUS_INVALID_HYPERCALL_CODE;
+ break;
+ }
+
+ ret = res | (((u64)rep_done & 0xfff) << 32);
+ if (longmode) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ } else {
+ kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
+ }
+
+ return 1;
+}
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
new file mode 100644
index 000000000000..c7bce559f67b
--- /dev/null
+++ b/arch/x86/kvm/hyperv.h
@@ -0,0 +1,32 @@
+/*
+ * KVM Microsoft Hyper-V emulation
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright (C) 2008 Qumranet, Inc.
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (C) 2015 Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * Authors:
+ * Avi Kivity <avi@qumranet.com>
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Amit Shah <amit.shah@qumranet.com>
+ * Ben-Ami Yassour <benami@il.ibm.com>
+ * Andrey Smetanin <asmetanin@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef __ARCH_X86_KVM_HYPERV_H__
+#define __ARCH_X86_KVM_HYPERV_H__
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_hv_hypercall_enabled(struct kvm *kvm);
+int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index fef922ff2635..7cc2360f1848 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -651,15 +651,10 @@ fail_unlock:
return NULL;
}
-void kvm_destroy_pic(struct kvm *kvm)
+void kvm_destroy_pic(struct kvm_pic *vpic)
{
- struct kvm_pic *vpic = kvm->arch.vpic;
-
- if (vpic) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
- kvm->arch.vpic = NULL;
- kfree(vpic);
- }
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+ kfree(vpic);
}
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 7dbced309ddb..5c520ebf6343 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -200,6 +200,7 @@ int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
goto out_unmap;
}
+ kvm_arch_start_assignment(kvm);
pci_set_dev_assigned(pdev);
dev_info(&pdev->dev, "kvm assign device\n");
@@ -224,6 +225,7 @@ int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
iommu_detach_device(domain, &pdev->dev);
pci_clear_dev_assigned(pdev);
+ kvm_arch_end_assignment(kvm);
dev_info(&pdev->dev, "kvm deassign device\n");
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ad68c73008c5..3d782a2c336a 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -74,7 +74,7 @@ struct kvm_pic {
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm *kvm);
+void kvm_destroy_pic(struct kvm_pic *vpic);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
@@ -85,11 +85,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
static inline int irqchip_in_kernel(struct kvm *kvm)
{
- int ret;
+ struct kvm_pic *vpic = pic_irqchip(kvm);
- ret = (pic_irqchip(kvm) != NULL);
+ /* Read vpic before kvm->irq_routing. */
smp_rmb();
- return ret;
+ return vpic != NULL;
}
void kvm_pic_reset(struct kvm_kpic_state *s);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 954e98a8c2e3..8d9013c5e1ee 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
tsc_deadline = apic->lapic_timer.expired_tscdeadline;
apic->lapic_timer.expired_tscdeadline = 0;
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
/* __delay is delay_tsc whenever the hardware has TSC, thus always. */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
@@ -1595,7 +1595,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_LINT0_REENABLED))
apic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
@@ -1900,8 +1900,9 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
+ return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 71952748222a..764037991d26 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -91,7 +91,7 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
+ return vcpu->arch.hyperv.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
}
int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f807496b62c2..ff606f507913 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -357,12 +357,6 @@ static u64 __get_spte_lockless(u64 *sptep)
{
return ACCESS_ONCE(*sptep);
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- /* It is valid if the spte is zapped. */
- return spte == 0ull;
-}
#else
union split_spte {
struct {
@@ -478,23 +472,6 @@ retry:
return spte.spte;
}
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
- union split_spte sspte = (union split_spte)spte;
- u32 high_mmio_mask = shadow_mmio_mask >> 32;
-
- /* It is valid if the spte is zapped. */
- if (spte == 0ull)
- return true;
-
- /* It is valid if the spte is being zapped. */
- if (sspte.spte_low == 0ull &&
- (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
- return true;
-
- return false;
-}
#endif
static bool spte_is_locklessly_modifiable(u64 spte)
@@ -2479,6 +2456,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
+static bool kvm_is_mmio_pfn(pfn_t pfn)
+{
+ if (pfn_valid(pfn))
+ return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+
+ return true;
+}
+
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
@@ -2506,7 +2491,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
- kvm_is_reserved_pfn(pfn));
+ kvm_is_mmio_pfn(pfn));
if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
@@ -3283,54 +3268,90 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
-static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+static bool
+__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
{
- if (direct)
- return vcpu_match_mmio_gpa(vcpu, addr);
+ int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
- return vcpu_match_mmio_gva(vcpu, addr);
+ return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
+ ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
}
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
+}
-/*
- * On direct hosts, the last spte is only allows two states
- * for mmio page fault:
- * - It is the mmio spte
- * - It is zapped or it is being zapped.
- *
- * This function completely checks the spte when the last spte
- * is not the mmio spte.
- */
-static bool check_direct_spte_mmio_pf(u64 spte)
+static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
{
- return __check_direct_spte_mmio_pf(spte);
+ return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
}
-static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+/* return true if reserved bit is detected on spte. */
+static bool
+walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
- u64 spte = 0ull;
+ u64 sptes[PT64_ROOT_LEVEL], spte = 0ull;
+ int root, leaf;
+ bool reserved = false;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return spte;
+ goto exit;
walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+
+ for (shadow_walk_init(&iterator, vcpu, addr),
+ leaf = root = iterator.level;
+ shadow_walk_okay(&iterator);
+ __shadow_walk_next(&iterator, spte)) {
+ spte = mmu_spte_get_lockless(iterator.sptep);
+
+ sptes[leaf - 1] = spte;
+ leaf--;
+
if (!is_shadow_present_pte(spte))
break;
+
+ reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+ iterator.level);
+ }
+
walk_shadow_page_lockless_end(vcpu);
- return spte;
+ if (reserved) {
+ pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
+ __func__, addr);
+ while (root > leaf) {
+ pr_err("------ spte 0x%llx level %d.\n",
+ sptes[root - 1], root);
+ root--;
+ }
+ }
+exit:
+ *sptep = spte;
+ return reserved;
}
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
+ bool reserved;
if (quickly_check_mmio_pf(vcpu, addr, direct))
return RET_MMIO_PF_EMULATE;
- spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+ reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
+ if (unlikely(reserved))
+ return RET_MMIO_PF_BUG;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
@@ -3348,13 +3369,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
}
/*
- * It's ok if the gva is remapped by other cpus on shadow guest,
- * it's a BUG if the gfn is not a mmio page.
- */
- if (direct && !check_direct_spte_mmio_pf(spte))
- return RET_MMIO_PF_BUG;
-
- /*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
@@ -3596,102 +3610,119 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp
#include "paging_tmpl.h"
#undef PTTYPE
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void
+__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, int level, bool nx, bool gbpages,
+ bool pse, bool amd)
{
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
- context->bad_mt_xwr = 0;
+ rsvd_check->bad_mt_xwr = 0;
- if (!context->nx)
+ if (!nx)
exb_bit_rsvd = rsvd_bits(63, 63);
- if (!guest_cpuid_has_gbpages(vcpu))
+ if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
/*
* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
* leaf entries) on AMD CPUs only.
*/
- if (guest_cpuid_is_amd(vcpu))
+ if (amd)
nonleaf_bit8_rsvd = rsvd_bits(8, 8);
- switch (context->root_level) {
+ switch (level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
- context->rsvd_bits_mask[0][1] = 0;
- context->rsvd_bits_mask[0][0] = 0;
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ rsvd_check->rsvd_bits_mask[0][1] = 0;
+ rsvd_check->rsvd_bits_mask[0][0] = 0;
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
- if (!is_pse(vcpu)) {
- context->rsvd_bits_mask[1][1] = 0;
+ if (!pse) {
+ rsvd_check->rsvd_bits_mask[1][1] = 0;
break;
}
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
- context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+ rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
- context->rsvd_bits_mask[0][2] =
+ rsvd_check->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 63) |
rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62); /* PDE */
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62); /* PTE */
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
break;
case PT64_ROOT_LEVEL:
- context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd |
+ rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
- context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[1][3] =
+ rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 29);
- context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+ rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51) |
rsvd_bits(13, 20); /* large page */
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ rsvd_check->rsvd_bits_mask[1][0] =
+ rsvd_check->rsvd_bits_mask[0][0];
break;
}
}
-static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context, bool execonly)
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), context->root_level,
+ context->nx, guest_cpuid_has_gbpages(vcpu),
+ is_pse(vcpu), guest_cpuid_is_amd(vcpu));
+}
+
+static void
+__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+ int maxphyaddr, bool execonly)
{
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
int pte;
- context->rsvd_bits_mask[0][3] =
+ rsvd_check->rsvd_bits_mask[0][3] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
- context->rsvd_bits_mask[0][2] =
+ rsvd_check->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- context->rsvd_bits_mask[0][1] =
+ rsvd_check->rsvd_bits_mask[0][1] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
- context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
+ rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
/* large page */
- context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
- context->rsvd_bits_mask[1][2] =
+ rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
+ rsvd_check->rsvd_bits_mask[1][2] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
- context->rsvd_bits_mask[1][1] =
+ rsvd_check->rsvd_bits_mask[1][1] =
rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
- context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+ rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
for (pte = 0; pte < 64; pte++) {
int rwx_bits = pte & 7;
@@ -3699,10 +3730,75 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
rwx_bits == 0x2 || rwx_bits == 0x6 ||
(rwx_bits == 0x4 && !execonly))
- context->bad_mt_xwr |= (1ull << pte);
+ rsvd_check->bad_mt_xwr |= (1ull << pte);
}
}
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+ cpuid_maxphyaddr(vcpu), execonly);
+}
+
+/*
+ * the page table on host is the shadow page table for the page
+ * table in guest or amd nested guest, its mmu features completely
+ * follow the features in guest.
+ */
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+ /*
+ * Passing "true" to the last argument is okay; it adds a check
+ * on bit 8 of the SPTEs which KVM doesn't use anyway.
+ */
+ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ boot_cpu_data.x86_phys_bits,
+ context->shadow_root_level, context->nx,
+ guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
+ true);
+}
+EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
+
+static inline bool boot_cpu_is_amd(void)
+{
+ WARN_ON_ONCE(!tdp_enabled);
+ return shadow_x_mask == 0;
+}
+
+/*
+ * the direct page table on host, use as much mmu features as
+ * possible, however, kvm currently does not do execution-protection.
+ */
+static void
+reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+ if (boot_cpu_is_amd())
+ __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ boot_cpu_data.x86_phys_bits,
+ context->shadow_root_level, false,
+ cpu_has_gbpages, true, true);
+ else
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ boot_cpu_data.x86_phys_bits,
+ false);
+
+}
+
+/*
+ * as the comments in reset_shadow_zero_bits_mask() except it
+ * is the shadow page table for intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context, bool execonly)
+{
+ __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ boot_cpu_data.x86_phys_bits, execonly);
+}
+
static void update_permission_bitmask(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu, bool ept)
{
@@ -3881,6 +3977,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
+ reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
@@ -3908,6 +4005,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
context->base_role.smap_andnot_wp
= smap && !is_write_protection(vcpu);
context->base_role.smm = is_smm(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, context);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
@@ -3931,6 +4029,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
update_permission_bitmask(vcpu, context, true);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+ reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
@@ -4852,28 +4951,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
- struct kvm_shadow_walk_iterator iterator;
- u64 spte;
- int nr_sptes = 0;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return nr_sptes;
-
- walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
- sptes[iterator.level-1] = spte;
- nr_sptes++;
- if (!is_shadow_present_pte(spte))
- break;
- }
- walk_shadow_page_lockless_end(vcpu);
-
- return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_mmu_unload(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 398d21c0f6dd..e4202e41d535 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -50,9 +50,11 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
/*
* Return values of handle_mmio_page_fault_common:
* RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index de1d2d8062e2..9e8bf13572e6 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -120,6 +120,16 @@ static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
}
+static u8 mtrr_disabled_type(void)
+{
+ /*
+ * Intel SDM 11.11.2.2: all MTRRs are disabled when
+ * IA32_MTRR_DEF_TYPE.E bit is cleared, and the UC
+ * memory type is applied to all of physical memory.
+ */
+ return MTRR_TYPE_UNCACHABLE;
+}
+
/*
* Three terms are used in the following code:
* - segment, it indicates the address segments covered by fixed MTRRs.
@@ -434,6 +444,8 @@ struct mtrr_iter {
/* output fields. */
int mem_type;
+ /* mtrr is completely disabled? */
+ bool mtrr_disabled;
/* [start, end) is not fully covered in MTRRs? */
bool partial_map;
@@ -549,7 +561,7 @@ static void mtrr_lookup_var_next(struct mtrr_iter *iter)
static void mtrr_lookup_start(struct mtrr_iter *iter)
{
if (!mtrr_is_enabled(iter->mtrr_state)) {
- iter->partial_map = true;
+ iter->mtrr_disabled = true;
return;
}
@@ -563,6 +575,7 @@ static void mtrr_lookup_init(struct mtrr_iter *iter,
iter->mtrr_state = mtrr_state;
iter->start = start;
iter->end = end;
+ iter->mtrr_disabled = false;
iter->partial_map = false;
iter->fixed = false;
iter->range = NULL;
@@ -656,15 +669,19 @@ u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
return MTRR_TYPE_WRBACK;
}
- /* It is not covered by MTRRs. */
- if (iter.partial_map) {
- /*
- * We just check one page, partially covered by MTRRs is
- * impossible.
- */
- WARN_ON(type != -1);
- type = mtrr_default_type(mtrr_state);
- }
+ if (iter.mtrr_disabled)
+ return mtrr_disabled_type();
+
+ /* not contained in any MTRRs. */
+ if (type == -1)
+ return mtrr_default_type(mtrr_state);
+
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(iter.partial_map);
+
return type;
}
EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
@@ -689,6 +706,9 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
return false;
}
+ if (iter.mtrr_disabled)
+ return true;
+
if (!iter.partial_map)
return true;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 0f67d7e24800..736e6ab8784d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
*access &= mask;
}
-static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
-
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
- ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
-}
-
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
@@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (FNAME(is_rsvd_bits_set)(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
if (!FNAME(is_present_gpte)(gpte))
@@ -353,8 +345,7 @@ retry_walk:
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
- if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
- walker->level))) {
+ if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
index 886aa25a7131..39b91127ef07 100644
--- a/arch/x86/kvm/pmu_amd.c
+++ b/arch/x86/kvm/pmu_amd.c
@@ -133,8 +133,6 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* MSR_K7_PERFCTRn */
pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
if (pmc) {
- if (!msr_info->host_initiated)
- data = (s64)data;
pmc->counter += data - pmc_read_counter(pmc);
return 0;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 602b974a60a6..2f9ed1ff0632 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,7 @@ module_param(npt, int, S_IRUGO);
static int nested = true;
module_param(nested, int, S_IRUGO);
+static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -513,7 +514,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (svm->vmcb->control.next_rip != 0) {
- WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
+ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
}
@@ -1080,7 +1081,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = svm_scale_tsc(vcpu, native_read_tsc());
+ tsc = svm_scale_tsc(vcpu, rdtsc());
return target_tsc - tsc;
}
@@ -1167,7 +1168,8 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
* It also updates the guest-visible cr0 value.
*/
- (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
+ kvm_mmu_reset_context(&svm->vcpu);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
@@ -1579,7 +1581,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* does not do it - this results in some delay at
* reboot
*/
- if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
@@ -2013,6 +2015,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
vcpu->arch.mmu.shadow_root_level = get_npt_level();
+ reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
@@ -3079,7 +3082,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
switch (msr_info->index) {
case MSR_IA32_TSC: {
msr_info->data = svm->vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, native_read_tsc());
+ svm_scale_tsc(vcpu, rdtsc());
break;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e856dd566f4c..06ef4908ba61 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1264,7 +1264,7 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
/*
* This bitmap is used to indicate whether the vmclear
* operation is enabled on all cpus. All disabled by
@@ -1302,7 +1302,7 @@ static void crash_vmclear_local_loaded_vmcss(void)
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
{
u64 host_tsc, tsc_offset;
- rdtscll(host_tsc);
+ host_tsc = rdtsc();
tsc_offset = vmcs_read64(TSC_OFFSET);
return host_tsc + tsc_offset;
}
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
- return target_tsc - native_read_tsc();
+ return target_tsc - rdtsc();
}
static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
@@ -2443,10 +2443,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
- CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
- CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
- CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
+ CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
* hardware. For example, L1 can specify an MSR bitmap - and we
@@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
struct page *pages;
struct vmcs *vmcs;
- pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -3423,12 +3423,12 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
vmx_segment_cache_clear(to_vmx(vcpu));
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
- if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
+ if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
pr_debug_ratelimited("%s: tss fixup for long mode. \n",
__func__);
vmcs_write32(GUEST_TR_AR_BYTES,
- (guest_tr_ar & ~AR_TYPE_MASK)
- | AR_TYPE_BUSY_64_TSS);
+ (guest_tr_ar & ~VMX_AR_TYPE_MASK)
+ | VMX_AR_TYPE_BUSY_64_TSS);
}
vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}
@@ -3719,7 +3719,7 @@ static int vmx_get_cpl(struct kvm_vcpu *vcpu)
return 0;
else {
int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
- return AR_DPL(ar);
+ return VMX_AR_DPL(ar);
}
}
@@ -3847,11 +3847,11 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
if (cs.unusable)
return false;
- if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
+ if (~cs.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_ACCESSES_MASK))
return false;
if (!cs.s)
return false;
- if (cs.type & AR_TYPE_WRITEABLE_MASK) {
+ if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
if (cs.dpl > cs_rpl)
return false;
} else {
@@ -3901,7 +3901,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
return false;
if (!var.present)
return false;
- if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
+ if (~var.type & (VMX_AR_TYPE_CODE_MASK|VMX_AR_TYPE_WRITEABLE_MASK)) {
if (var.dpl < rpl) /* DPL < RPL */
return false;
}
@@ -5759,73 +5759,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
-static u64 ept_rsvd_mask(u64 spte, int level)
-{
- int i;
- u64 mask = 0;
-
- for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
- mask |= (1ULL << i);
-
- if (level == 4)
- /* bits 7:3 reserved */
- mask |= 0xf8;
- else if (spte & (1ULL << 7))
- /*
- * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
- * level == 1 if the hypervisor is using the ignored bit 7.
- */
- mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
- else if (level > 1)
- /* bits 6:3 reserved */
- mask |= 0x78;
-
- return mask;
-}
-
-static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
- int level)
-{
- printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
-
- /* 010b (write-only) */
- WARN_ON((spte & 0x7) == 0x2);
-
- /* 110b (write/execute) */
- WARN_ON((spte & 0x7) == 0x6);
-
- /* 100b (execute-only) and value not supported by logical processor */
- if (!cpu_has_vmx_ept_execute_only())
- WARN_ON((spte & 0x7) == 0x4);
-
- /* not 000b */
- if ((spte & 0x7)) {
- u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
-
- if (rsvd_bits != 0) {
- printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
- __func__, rsvd_bits);
- WARN_ON(1);
- }
-
- /* bits 5:3 are _not_ reserved for large page or leaf page */
- if ((rsvd_bits & 0x38) == 0) {
- u64 ept_mem_type = (spte & 0x38) >> 3;
-
- if (ept_mem_type == 2 || ept_mem_type == 3 ||
- ept_mem_type == 7) {
- printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
- __func__, ept_mem_type);
- WARN_ON(1);
- }
- }
- }
-}
-
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
- u64 sptes[4];
- int nr_sptes, i, ret;
+ int ret;
gpa_t gpa;
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -5846,13 +5782,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
return 1;
/* It is the real ept misconfig */
- printk(KERN_ERR "EPT: Misconfiguration.\n");
- printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
-
- nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
-
- for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
- ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+ WARN_ON(1);
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
@@ -6134,6 +6064,8 @@ static __init int hardware_setup(void)
memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
if (enable_apicv) {
for (msr = 0x800; msr <= 0x8ff; msr++)
vmx_disable_intercept_msr_read_x2apic(msr);
@@ -6246,6 +6178,11 @@ static int handle_mwait(struct kvm_vcpu *vcpu)
return handle_nop(vcpu);
}
+static int handle_monitor_trap(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
static int handle_monitor(struct kvm_vcpu *vcpu)
{
printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
@@ -6408,8 +6345,12 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
*/
static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
unsigned long exit_qualification,
- u32 vmx_instruction_info, gva_t *ret)
+ u32 vmx_instruction_info, bool wr, gva_t *ret)
{
+ gva_t off;
+ bool exn;
+ struct kvm_segment s;
+
/*
* According to Vol. 3B, "Information for VM Exits Due to Instruction
* Execution", on an exit, vmx_instruction_info holds most of the
@@ -6434,22 +6375,63 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
/* Addr = segment_base + offset */
/* offset = base + [index * scale] + displacement */
- *ret = vmx_get_segment_base(vcpu, seg_reg);
+ off = exit_qualification; /* holds the displacement */
if (base_is_valid)
- *ret += kvm_register_read(vcpu, base_reg);
+ off += kvm_register_read(vcpu, base_reg);
if (index_is_valid)
- *ret += kvm_register_read(vcpu, index_reg)<<scaling;
- *ret += exit_qualification; /* holds the displacement */
+ off += kvm_register_read(vcpu, index_reg)<<scaling;
+ vmx_get_segment(vcpu, &s, seg_reg);
+ *ret = s.base + off;
if (addr_size == 1) /* 32 bit */
*ret &= 0xffffffff;
- /*
- * TODO: throw #GP (and return 1) in various cases that the VM*
- * instructions require it - e.g., offset beyond segment limit,
- * unusable or unreadable/unwritable segment, non-canonical 64-bit
- * address, and so on. Currently these are not checked.
- */
+ /* Checks for #GP/#SS exceptions. */
+ exn = false;
+ if (is_protmode(vcpu)) {
+ /* Protected mode: apply checks for segment validity in the
+ * following order:
+ * - segment type check (#GP(0) may be thrown)
+ * - usability check (#GP(0)/#SS(0))
+ * - limit check (#GP(0)/#SS(0))
+ */
+ if (wr)
+ /* #GP(0) if the destination operand is located in a
+ * read-only data segment or any code segment.
+ */
+ exn = ((s.type & 0xa) == 0 || (s.type & 8));
+ else
+ /* #GP(0) if the source operand is located in an
+ * execute-only code segment
+ */
+ exn = ((s.type & 0xa) == 8);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ if (is_long_mode(vcpu)) {
+ /* Long mode: #GP(0)/#SS(0) if the memory address is in a
+ * non-canonical form. This is an only check for long mode.
+ */
+ exn = is_noncanonical_address(*ret);
+ } else if (is_protmode(vcpu)) {
+ /* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
+ */
+ exn = (s.unusable != 0);
+ /* Protected mode: #GP(0)/#SS(0) if the memory
+ * operand is outside the segment limit.
+ */
+ exn = exn || (off + sizeof(u64) > s.limit);
+ }
+ if (exn) {
+ kvm_queue_exception_e(vcpu,
+ seg_reg == VCPU_SREG_SS ?
+ SS_VECTOR : GP_VECTOR,
+ 0);
+ return 1;
+ }
+
return 0;
}
@@ -6471,7 +6453,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+ vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
@@ -6999,7 +6981,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
field_value);
} else {
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &gva))
+ vmx_instruction_info, true, &gva))
return 1;
/* _system ok, as nested_vmx_check_permission verified cpl=0 */
kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
@@ -7036,7 +7018,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
(((vmx_instruction_info) >> 3) & 0xf));
else {
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &gva))
+ vmx_instruction_info, false, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
&field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
@@ -7128,7 +7110,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, &vmcs_gva))
+ vmx_instruction_info, true, &vmcs_gva))
return 1;
/* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
@@ -7184,7 +7166,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
* operand is read even if it isn't needed (e.g., for type==global)
*/
if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
- vmx_instruction_info, &gva))
+ vmx_instruction_info, false, &gva))
return 1;
if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
sizeof(operand), &e)) {
@@ -7282,6 +7264,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
+ [EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
[EXIT_REASON_INVVPID] = handle_invvpid,
@@ -7542,6 +7525,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return true;
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+ case EXIT_REASON_MONITOR_TRAP_FLAG:
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
case EXIT_REASON_MONITOR_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
case EXIT_REASON_PAUSE_INSTRUCTION:
@@ -8655,7 +8640,10 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_UNCACHABLE;
+ if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
+ cache = MTRR_TYPE_WRBACK;
+ else
+ cache = MTRR_TYPE_UNCACHABLE;
goto exit;
}
@@ -10430,7 +10418,7 @@ static int __init vmx_init(void)
if (r)
return r;
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
@@ -10440,7 +10428,7 @@ static int __init vmx_init(void)
static void __exit vmx_exit(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbaf44e8f0d3..92511d4b7236 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -29,6 +29,7 @@
#include "cpuid.h"
#include "assigned-dev.h"
#include "pmu.h"
+#include "hyperv.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -148,6 +149,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
+ { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
@@ -221,11 +223,9 @@ static void shared_msr_update(unsigned slot, u32 msr)
void kvm_define_shared_msr(unsigned slot, u32 msr)
{
BUG_ON(slot >= KVM_NR_SHARED_MSRS);
+ shared_msrs_global.msrs[slot] = msr;
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
- shared_msrs_global.msrs[slot] = msr;
- /* we need ensured the shared_msr_global have been updated */
- smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
@@ -526,7 +526,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
if (is_present_gpte(pdpte[i]) &&
- (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
+ (pdpte[i] &
+ vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
goto out;
}
@@ -949,6 +950,8 @@ static u32 emulated_msrs[] = {
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1217,11 +1220,6 @@ static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
__func__, base_khz, scaled_khz, shift, *pmultiplier);
}
-static inline u64 get_kernel_ns(void)
-{
- return ktime_get_boot_ns();
-}
-
#ifdef CONFIG_X86_64
static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
@@ -1444,20 +1442,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
static cycle_t read_tsc(void)
{
- cycle_t ret;
- u64 last;
-
- /*
- * Empirically, a fence (of type that depends on the CPU)
- * before rdtsc is enough to ensure that rdtsc is ordered
- * with respect to loads. The various CPU manuals are unclear
- * as to whether rdtsc can be reordered with later loads,
- * but no one has ever seen it happen.
- */
- rdtsc_barrier();
- ret = (cycle_t)vget_cycles();
-
- last = pvclock_gtod_data.clock.cycle_last;
+ cycle_t ret = (cycle_t)rdtsc_ordered();
+ u64 last = pvclock_gtod_data.clock.cycle_last;
if (likely(ret >= last))
return ret;
@@ -1646,7 +1632,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
return 1;
}
if (!use_master_clock) {
- host_tsc = native_read_tsc();
+ host_tsc = rdtsc();
kernel_ns = get_kernel_ns();
}
@@ -1722,8 +1708,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->pvclock_set_guest_stopped_request = false;
}
- pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
-
/* If the host uses TSC clocksource, then it is stable */
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1869,123 +1853,6 @@ out:
return r;
}
-static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
-{
- return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
-}
-
-static bool kvm_hv_msr_partition_wide(u32 msr)
-{
- bool r = false;
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- case HV_X64_MSR_HYPERCALL:
- case HV_X64_MSR_REFERENCE_TSC:
- case HV_X64_MSR_TIME_REF_COUNT:
- r = true;
- break;
- }
-
- return r;
-}
-
-static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- struct kvm *kvm = vcpu->kvm;
-
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- kvm->arch.hv_guest_os_id = data;
- /* setting guest os id to zero disables hypercall page */
- if (!kvm->arch.hv_guest_os_id)
- kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
- break;
- case HV_X64_MSR_HYPERCALL: {
- u64 gfn;
- unsigned long addr;
- u8 instructions[4];
-
- /* if guest os id is not set hypercall should remain disabled */
- if (!kvm->arch.hv_guest_os_id)
- break;
- if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
- kvm->arch.hv_hypercall = data;
- break;
- }
- gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- kvm_x86_ops->patch_hypercall(vcpu, instructions);
- ((unsigned char *)instructions)[3] = 0xc3; /* ret */
- if (__copy_to_user((void __user *)addr, instructions, 4))
- return 1;
- kvm->arch.hv_hypercall = data;
- mark_page_dirty(kvm, gfn);
- break;
- }
- case HV_X64_MSR_REFERENCE_TSC: {
- u64 gfn;
- HV_REFERENCE_TSC_PAGE tsc_ref;
- memset(&tsc_ref, 0, sizeof(tsc_ref));
- kvm->arch.hv_tsc_page = data;
- if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
- break;
- gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
- if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
- &tsc_ref, sizeof(tsc_ref)))
- return 1;
- mark_page_dirty(kvm, gfn);
- break;
- }
- default:
- vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
- return 1;
- }
- return 0;
-}
-
-static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- switch (msr) {
- case HV_X64_MSR_APIC_ASSIST_PAGE: {
- u64 gfn;
- unsigned long addr;
-
- if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
- vcpu->arch.hv_vapic = data;
- if (kvm_lapic_enable_pv_eoi(vcpu, 0))
- return 1;
- break;
- }
- gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
- addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
- if (kvm_is_error_hva(addr))
- return 1;
- if (__clear_user((void __user *)addr, PAGE_SIZE))
- return 1;
- vcpu->arch.hv_vapic = data;
- kvm_vcpu_mark_page_dirty(vcpu, gfn);
- if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
- return 1;
- break;
- }
- case HV_X64_MSR_EOI:
- return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
- case HV_X64_MSR_ICR:
- return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
- case HV_X64_MSR_TPR:
- return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
- default:
- vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
- "data 0x%llx\n", msr, data);
- return 1;
- }
-
- return 0;
-}
-
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
@@ -2105,7 +1972,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (guest_cpuid_has_tsc_adjust(vcpu)) {
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
- kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+ adjust_tsc_offset_guest(vcpu, adj);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -2138,8 +2005,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&vcpu->requests);
ka->boot_vcpu_runs_old_kvmclock = tmp;
-
- ka->kvmclock_offset = -get_kernel_ns();
}
vcpu->arch.time = data;
@@ -2224,15 +2089,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
- int r;
- mutex_lock(&vcpu->kvm->lock);
- r = set_msr_hyperv_pw(vcpu, msr, data);
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- } else
- return set_msr_hyperv(vcpu, msr, data);
- break;
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_set_msr_common(vcpu, msr, data,
+ msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
@@ -2315,68 +2175,6 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data = 0;
- struct kvm *kvm = vcpu->kvm;
-
- switch (msr) {
- case HV_X64_MSR_GUEST_OS_ID:
- data = kvm->arch.hv_guest_os_id;
- break;
- case HV_X64_MSR_HYPERCALL:
- data = kvm->arch.hv_hypercall;
- break;
- case HV_X64_MSR_TIME_REF_COUNT: {
- data =
- div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
- break;
- }
- case HV_X64_MSR_REFERENCE_TSC:
- data = kvm->arch.hv_tsc_page;
- break;
- default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
-
- *pdata = data;
- return 0;
-}
-
-static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 data = 0;
-
- switch (msr) {
- case HV_X64_MSR_VP_INDEX: {
- int r;
- struct kvm_vcpu *v;
- kvm_for_each_vcpu(r, v, vcpu->kvm) {
- if (v == vcpu) {
- data = r;
- break;
- }
- }
- break;
- }
- case HV_X64_MSR_EOI:
- return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
- case HV_X64_MSR_ICR:
- return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
- case HV_X64_MSR_TPR:
- return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
- case HV_X64_MSR_APIC_ASSIST_PAGE:
- data = vcpu->arch.hv_vapic;
- break;
- default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
- return 1;
- }
- *pdata = data;
- return 0;
-}
-
int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
switch (msr_info->index) {
@@ -2388,6 +2186,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
case MSR_K8_SYSCFG:
+ case MSR_K8_TSEG_ADDR:
+ case MSR_K8_TSEG_MASK:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_K8_INT_PENDING_MSG:
@@ -2493,14 +2293,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr_info->index)) {
- int r;
- mutex_lock(&vcpu->kvm->lock);
- r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
- mutex_unlock(&vcpu->kvm->lock);
- return r;
- } else
- return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
+ case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+ case HV_X64_MSR_CRASH_CTL:
+ return kvm_hv_get_msr_common(vcpu,
+ msr_info->index, &msr_info->data);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
@@ -2651,6 +2447,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_DEADLINE_TIMER:
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
+ case KVM_CAP_SET_BOOT_CPU_ID:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2810,7 +2607,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
- native_read_tsc() - vcpu->arch.last_host_tsc;
+ rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
if (check_tsc_unstable()) {
@@ -2838,7 +2635,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- vcpu->arch.last_host_tsc = native_read_tsc();
+ vcpu->arch.last_host_tsc = rdtsc();
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -3157,8 +2954,7 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
memcpy(dest, src + offset, size);
- } else
- WARN_ON_ONCE(1);
+ }
valid -= feature;
}
@@ -3818,30 +3614,25 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_ioapic_init(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_master);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_slave);
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev_eclr);
+ kvm_destroy_pic(vpic);
mutex_unlock(&kvm->slots_lock);
- kfree(vpic);
goto create_irqchip_unlock;
}
} else
goto create_irqchip_unlock;
- smp_wmb();
- kvm->arch.vpic = vpic;
- smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
- kvm_destroy_pic(kvm);
+ kvm_destroy_pic(vpic);
mutex_unlock(&kvm->irq_lock);
mutex_unlock(&kvm->slots_lock);
+ goto create_irqchip_unlock;
}
+ /* Write kvm->irq_routing before kvm->arch.vpic. */
+ smp_wmb();
+ kvm->arch.vpic = vpic;
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -3968,6 +3759,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
+ case KVM_SET_BOOT_CPU_ID:
+ r = 0;
+ mutex_lock(&kvm->lock);
+ if (atomic_read(&kvm->online_vcpus) != 0)
+ r = -EBUSY;
+ else
+ kvm->arch.bsp_vcpu_id = arg;
+ mutex_unlock(&kvm->lock);
+ break;
case KVM_XEN_HVM_CONFIG: {
r = -EFAULT;
if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
@@ -5883,66 +5683,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
-{
- u64 param, ingpa, outgpa, ret;
- uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
- bool fast, longmode;
-
- /*
- * hypercall generates UD from non zero cpl and real mode
- * per HYPER-V spec
- */
- if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 0;
- }
-
- longmode = is_64_bit_mode(vcpu);
-
- if (!longmode) {
- param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
- ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
- outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
- (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
- }
-#ifdef CONFIG_X86_64
- else {
- param = kvm_register_read(vcpu, VCPU_REGS_RCX);
- ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
- outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
- }
-#endif
-
- code = param & 0xffff;
- fast = (param >> 16) & 0x1;
- rep_cnt = (param >> 32) & 0xfff;
- rep_idx = (param >> 48) & 0xfff;
-
- trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
-
- switch (code) {
- case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
- kvm_vcpu_on_spin(vcpu);
- break;
- default:
- res = HV_STATUS_INVALID_HYPERCALL_CODE;
- break;
- }
-
- ret = res | (((u64)rep_done & 0xfff) << 32);
- if (longmode) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
- } else {
- kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
- }
-
- return 1;
-}
-
/*
* kvm_pv_kick_cpu_op: Kick a vcpu.
*
@@ -6202,6 +5942,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
}
+#ifdef CONFIG_X86_64
static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
@@ -6217,6 +5958,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 4, seg.limit);
put_smstate(u64, buf, offset + 8, seg.base);
}
+#endif
static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
@@ -6328,6 +6070,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
static void process_smi(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
+ struct desc_ptr dt;
char buf[512];
u32 cr0;
@@ -6360,6 +6103,10 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_x86_ops->set_cr4(vcpu, 0);
+ /* Undocumented: IDT limit is set to zero on entry to SMM. */
+ dt.address = dt.size = 0;
+ kvm_x86_ops->set_idt(vcpu, &dt);
+
__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
@@ -6514,6 +6261,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
kvm_vcpu_reload_apic_access_page(vcpu);
+ if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+ r = 0;
+ goto out;
+ }
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6623,7 +6376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
hw_breakpoint_restore();
vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
- native_read_tsc());
+ rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -7315,11 +7068,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu = kvm_x86_ops->vcpu_create(kvm, id);
- /*
- * Activate fpu unconditionally in case the guest needs eager FPU. It will be
- * deactivated soon if it doesn't.
- */
- kvm_x86_ops->fpu_activate(vcpu);
return vcpu;
}
@@ -7437,7 +7185,7 @@ int kvm_arch_hardware_enable(void)
if (ret != 0)
return ret;
- local_tsc = native_read_tsc();
+ local_tsc = rdtsc();
stable = !check_tsc_unstable();
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -7541,6 +7289,17 @@ void kvm_arch_check_processor_compat(void *rtn)
kvm_x86_ops->check_processor_compatibility(rtn);
}
+bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
+{
+ return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
+
+bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
{
return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
@@ -8218,6 +7977,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
kvm_x86_ops->interrupt_allowed(vcpu);
}
+void kvm_arch_start_assignment(struct kvm *kvm)
+{
+ atomic_inc(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
+
+void kvm_arch_end_assignment(struct kvm *kvm)
+{
+ atomic_dec(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
+
+bool kvm_arch_has_assigned_device(struct kvm *kvm)
+{
+ return atomic_read(&kvm->arch.assigned_device_count);
+}
+EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
+
void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
atomic_inc(&kvm->arch.noncoherent_dma_count);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index edc8cdcd786b..2f822cd886c2 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,16 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
return kvm_register_write(vcpu, reg, val);
}
+static inline u64 get_kernel_ns(void)
+{
+ return ktime_get_boot_ns();
+}
+
+static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
+{
+ return !(kvm->arch.disabled_quirks & quirk);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f2dc08c003eb..a0d09f6c6533 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -835,16 +835,46 @@ static struct irq_chip lguest_irq_controller = {
.irq_unmask = enable_lguest_irq,
};
+/*
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
+ */
+static int lguest_setup_irq(unsigned int irq)
+{
+ struct irq_desc *desc;
+ int err;
+
+ /* Returns -ve error or vector number. */
+ err = irq_alloc_desc_at(irq, 0);
+ if (err < 0 && err != -EEXIST)
+ return err;
+
+ /*
+ * Tell the Linux infrastructure that the interrupt is
+ * controlled by our level-based lguest interrupt controller.
+ */
+ irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
+ handle_level_irq, "level");
+
+ /* Some systems map "vectors" to interrupts weirdly. Not us! */
+ desc = irq_to_desc(irq);
+ __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], desc);
+ return 0;
+}
+
static int lguest_enable_irq(struct pci_dev *dev)
{
+ int err;
u8 line = 0;
/* We literally use the PCI interrupt line as the irq number. */
pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
- irq_set_chip_and_handler_name(line, &lguest_irq_controller,
- handle_level_irq, "level");
- dev->irq = line;
- return 0;
+ err = lguest_setup_irq(line);
+ if (!err)
+ dev->irq = line;
+ return err;
}
/* We don't do hotplug PCI, so this shouldn't be called. */
@@ -855,17 +885,13 @@ static void lguest_disable_irq(struct pci_dev *dev)
/*
* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller.
+ * interrupt (except 128, which is used for system calls).
*/
static void __init lguest_init_IRQ(void)
{
unsigned int i;
for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
- /* Some systems map "vectors" to interrupts weirdly. Not us! */
- __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != IA32_SYSCALL_VECTOR)
set_intr_gate(i, irq_entries_start +
8 * (i - FIRST_EXTERNAL_VECTOR));
@@ -879,26 +905,6 @@ static void __init lguest_init_IRQ(void)
}
/*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-int lguest_setup_irq(unsigned int irq)
-{
- int err;
-
- /* Returns -ve error or vector number. */
- err = irq_alloc_desc_at(irq, 0);
- if (err < 0 && err != -EEXIST)
- return err;
-
- irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
- handle_level_irq, "level");
- return 0;
-}
-
-/*
* Time.
*
* It would be far better for everyone if the Guest had its own clock, but
@@ -985,23 +991,11 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
return 0;
}
-static void lguest_clockevent_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int lguest_clockevent_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- /* A 0 argument shuts the clock down. */
- hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
- break;
- case CLOCK_EVT_MODE_ONESHOT:
- /* This is what we expect. */
- break;
- case CLOCK_EVT_MODE_PERIODIC:
- BUG();
- case CLOCK_EVT_MODE_RESUME:
- break;
- }
+ /* A 0 argument shuts the clock down. */
+ hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
+ return 0;
}
/* This describes our primitive timer chip. */
@@ -1009,7 +1003,7 @@ static struct clock_event_device lguest_clockevent = {
.name = "lguest",
.features = CLOCK_EVT_FEAT_ONESHOT,
.set_next_event = lguest_clockevent_set_next_event,
- .set_mode = lguest_clockevent_set_mode,
+ .set_state_shutdown = lguest_clockevent_shutdown,
.rating = INT_MAX,
.mult = 1,
.shift = 0,
@@ -1021,7 +1015,7 @@ static struct clock_event_device lguest_clockevent = {
* This is the Guest timer interrupt handler (hardware interrupt 0). We just
* call the clockevent infrastructure and it does whatever needs doing.
*/
-static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
+static void lguest_time_irq(struct irq_desc *desc)
{
unsigned long flags;
@@ -1040,7 +1034,8 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
static void lguest_time_init(void)
{
/* Set up the timer interrupt (0) to go to our simple timer routine */
- lguest_setup_irq(0);
+ if (lguest_setup_irq(0) != 0)
+ panic("Could not set up timer irq");
irq_set_handler(0, lguest_time_irq);
clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 39d6a3db0b96..e912b2f6d36e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/timer.h>
+#include <asm/mwait.h>
#ifdef CONFIG_SMP
# include <asm/smp.h>
@@ -49,16 +50,14 @@ static void delay_loop(unsigned long loops)
/* TSC based delay: */
static void delay_tsc(unsigned long __loops)
{
- u32 bclock, now, loops = __loops;
+ u64 bclock, now, loops = __loops;
int cpu;
preempt_disable();
cpu = smp_processor_id();
- rdtsc_barrier();
- rdtscl(bclock);
+ bclock = rdtsc_ordered();
for (;;) {
- rdtsc_barrier();
- rdtscl(now);
+ now = rdtsc_ordered();
if ((now - bclock) >= loops)
break;
@@ -79,14 +78,51 @@ static void delay_tsc(unsigned long __loops)
if (unlikely(cpu != smp_processor_id())) {
loops -= (now - bclock);
cpu = smp_processor_id();
- rdtsc_barrier();
- rdtscl(bclock);
+ bclock = rdtsc_ordered();
}
}
preempt_enable();
}
/*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the loop of the
+ * counter, it will exit when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+ u64 start, end, delay, loops = __loops;
+
+ start = rdtsc_ordered();
+
+ for (;;) {
+ delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+ /*
+ * Use cpu_tss as a cacheline-aligned, seldomly
+ * accessed per-cpu variable as the monitor target.
+ */
+ __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+
+ /*
+ * AMD, like Intel, supports the EAX hint and EAX=0xf
+ * means, do not enter any deep C-state and we use it
+ * here in delay() to minimize wakeup latency.
+ */
+ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+
+ end = rdtsc_ordered();
+
+ if (loops <= end - start)
+ break;
+
+ loops -= end - start;
+
+ start = end;
+ }
+}
+
+/*
* Since we calibrate only once at boot, this
* function should be set once at boot and not changed
*/
@@ -94,13 +130,19 @@ static void (*delay_fn)(unsigned long) = delay_loop;
void use_tsc_delay(void)
{
- delay_fn = delay_tsc;
+ if (delay_fn == delay_loop)
+ delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+ delay_fn = delay_mwaitx;
}
int read_current_timer(unsigned long *timer_val)
{
if (delay_fn == delay_tsc) {
- rdtscll(*timer_val);
+ *timer_val = rdtsc();
return 0;
}
return -1;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index ddf9ecb53cc3..e342586db6e4 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -20,7 +20,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
unsigned long ret;
if (__range_not_ok(from, n, TASK_SIZE))
- return 0;
+ return n;
/*
* Even though this function is typically called from NMI/IRQ context
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index f37e84ab49f3..3d8f2e421466 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -29,7 +29,6 @@
#include <asm/uaccess.h>
#include <asm/traps.h>
-#include <asm/desc.h>
#include <asm/user.h>
#include <asm/fpu/internal.h>
@@ -181,7 +180,7 @@ void math_emulate(struct math_emu_info *info)
math_abort(FPU_info, SIGILL);
}
- code_descriptor = LDT_DESCRIPTOR(FPU_CS);
+ code_descriptor = FPU_get_ldt_descriptor(FPU_CS);
if (SEG_D_SIZE(code_descriptor)) {
/* The above test may be wrong, the book is not clear */
/* Segmented 32 bit protected mode */
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 9ccecb61a4fa..5e044d506b7a 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -16,9 +16,24 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-/* s is always from a cpu register, and the cpu does bounds checking
- * during register load --> no further bounds checks needed */
-#define LDT_DESCRIPTOR(s) (((struct desc_struct *)current->mm->context.ldt)[(s) >> 3])
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
+{
+ static struct desc_struct zero_desc;
+ struct desc_struct ret = zero_desc;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ seg >>= 3;
+ mutex_lock(&current->mm->context.lock);
+ if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
+ ret = current->mm->context.ldt->entries[seg];
+ mutex_unlock(&current->mm->context.lock);
+#endif
+ return ret;
+}
+
#define SEG_D_SIZE(x) ((x).b & (3 << 21))
#define SEG_G_BIT(x) ((x).b & (1 << 23))
#define SEG_GRANULARITY(x) (((x).b & (1 << 23)) ? 4096 : 1)
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 6ef5e99380f9..8db26591d91a 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -20,7 +20,7 @@
#include <linux/stddef.h>
#include <asm/uaccess.h>
-#include <asm/desc.h>
+#include <asm/vm86.h>
#include "fpu_system.h"
#include "exception.h"
@@ -158,7 +158,7 @@ static long pm_address(u_char FPU_modrm, u_char segment,
addr->selector = PM_REG_(segment);
}
- descriptor = LDT_DESCRIPTOR(PM_REG_(segment));
+ descriptor = FPU_get_ldt_descriptor(addr->selector);
base_address = SEG_BASE_ADDR(descriptor);
address = base_address + offset;
limit = base_address
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9dc909841739..eef44d9a3f77 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
+#include <asm/vm86.h> /* struct vm86 */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -301,14 +302,16 @@ static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
struct task_struct *tsk)
{
+#ifdef CONFIG_VM86
unsigned long bit;
- if (!v8086_mode(regs))
+ if (!v8086_mode(regs) || !tsk->thread.vm86)
return;
bit = (address - 0xA0000) >> PAGE_SHIFT;
if (bit < 32)
- tsk->thread.screen_bitmap |= 1 << bit;
+ tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
}
static bool low_pfn(unsigned long pfn)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8533b46e6bee..1d8a83df153a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -30,8 +30,11 @@
/*
* Tables translating between page_cache_type_t and pte encoding.
*
- * Minimal supported modes are defined statically, they are modified
- * during bootup if more supported cache modes are available.
+ * The default values are defined statically as minimal supported mode;
+ * WC and WT fall back to UC-. pat_init() updates these values to support
+ * more cache modes, WC and WT, when it is safe to do so. See pat_init()
+ * for the details. Note, __early_ioremap() used during early boot-time
+ * takes pgprot_t (pte encoding) and does not use these tables.
*
* Index into __cachemode2pte_tbl[] is the cachemode.
*
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8340e45c891a..7562f42914b4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -137,6 +137,7 @@ page_table_range_init_count(unsigned long start, unsigned long end)
vaddr = start;
pgd_idx = pgd_index(vaddr);
+ pmd_idx = pmd_index(vaddr);
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
@@ -822,11 +823,11 @@ void __init mem_init(void)
}
#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones +
- zone_for_memory(nid, start, size, ZONE_HIGHMEM);
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3fba623e3ba5..df48430c279b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
* Memory is added always to NORMAL zone. This means you will never get
* additional DMA/DMA32 memory.
*/
-int arch_add_memory(int nid, u64 start, u64 size)
+int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones +
- zone_for_memory(nid, start, size, ZONE_NORMAL);
+ zone_for_memory(nid, start, size, ZONE_NORMAL, for_device);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
@@ -1132,7 +1132,7 @@ void mark_rodata_ro(void)
* has been zapped already via cleanup_highmem().
*/
all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
- set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
+ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
rodata_test();
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index cc5ccc415cc0..b9c78f3bcd67 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -63,8 +63,6 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
!PageReserved(pfn_to_page(start_pfn + i)))
return 1;
- WARN_ONCE(1, "ioremap on RAM pfn 0x%lx\n", start_pfn);
-
return 0;
}
@@ -94,7 +92,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pgprot_t prot;
int retval;
void __iomem *ret_addr;
- int ram_region;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -117,23 +114,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- /* First check if whole region can be identified as RAM or not */
- ram_region = region_is_ram(phys_addr, size);
- if (ram_region > 0) {
- WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n",
- (unsigned long int)phys_addr,
- (unsigned long int)last_addr);
+ pfn = phys_addr >> PAGE_SHIFT;
+ last_pfn = last_addr >> PAGE_SHIFT;
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1) {
+ WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
+ &phys_addr, &last_addr);
return NULL;
}
- /* If could not be identified(-1), check page by page */
- if (ram_region < 0) {
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1)
- return NULL;
- }
/*
* Mappings have to be page-aligned
*/
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 4860906c6b9f..9ce5da27b136 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -1,3 +1,4 @@
+#define pr_fmt(fmt) "kasan: " fmt
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
@@ -11,8 +12,6 @@
extern pgd_t early_level4_pgt[PTRS_PER_PGD];
extern struct range pfn_mapped[E820_X_MAX];
-extern unsigned char kasan_zero_page[PAGE_SIZE];
-
static int __init map_range(struct range *range)
{
unsigned long start;
@@ -36,7 +35,7 @@ static void __init clear_pgds(unsigned long start,
pgd_clear(pgd_offset_k(start));
}
-void __init kasan_map_early_shadow(pgd_t *pgd)
+static void __init kasan_map_early_shadow(pgd_t *pgd)
{
int i;
unsigned long start = KASAN_SHADOW_START;
@@ -49,106 +48,6 @@ void __init kasan_map_early_shadow(pgd_t *pgd)
}
}
-static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
- unsigned long end)
-{
- pte_t *pte = pte_offset_kernel(pmd, addr);
-
- while (addr + PAGE_SIZE <= end) {
- WARN_ON(!pte_none(*pte));
- set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
- | __PAGE_KERNEL_RO));
- addr += PAGE_SIZE;
- pte = pte_offset_kernel(pmd, addr);
- }
- return 0;
-}
-
-static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
- unsigned long end)
-{
- int ret = 0;
- pmd_t *pmd = pmd_offset(pud, addr);
-
- while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
- WARN_ON(!pmd_none(*pmd));
- set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
- | __PAGE_KERNEL_RO));
- addr += PMD_SIZE;
- pmd = pmd_offset(pud, addr);
- }
- if (addr < end) {
- if (pmd_none(*pmd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pte_populate(pmd, addr, end);
- }
- return ret;
-}
-
-
-static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
- unsigned long end)
-{
- int ret = 0;
- pud_t *pud = pud_offset(pgd, addr);
-
- while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
- WARN_ON(!pud_none(*pud));
- set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
- | __PAGE_KERNEL_RO));
- addr += PUD_SIZE;
- pud = pud_offset(pgd, addr);
- }
-
- if (addr < end) {
- if (pud_none(*pud)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pmd_populate(pud, addr, end);
- }
- return ret;
-}
-
-static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
-{
- int ret = 0;
- pgd_t *pgd = pgd_offset_k(addr);
-
- while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
- WARN_ON(!pgd_none(*pgd));
- set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
- | __PAGE_KERNEL_RO));
- addr += PGDIR_SIZE;
- pgd = pgd_offset_k(addr);
- }
-
- if (addr < end) {
- if (pgd_none(*pgd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
- if (!p)
- return -ENOMEM;
- set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
- }
- ret = zero_pud_populate(pgd, addr, end);
- }
- return ret;
-}
-
-
-static void __init populate_zero_shadow(const void *start, const void *end)
-{
- if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
- panic("kasan: unable to map zero shadow!");
-}
-
-
#ifdef CONFIG_KASAN_INLINE
static int kasan_die_handler(struct notifier_block *self,
unsigned long val,
@@ -166,6 +65,26 @@ static struct notifier_block kasan_die_notifier = {
};
#endif
+void __init kasan_early_init(void)
+{
+ int i;
+ pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+ pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+ pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+ for (i = 0; i < PTRS_PER_PTE; i++)
+ kasan_zero_pte[i] = __pte(pte_val);
+
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ kasan_zero_pmd[i] = __pmd(pmd_val);
+
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ kasan_zero_pud[i] = __pud(pud_val);
+
+ kasan_map_early_shadow(early_level4_pgt);
+ kasan_map_early_shadow(init_level4_pgt);
+}
+
void __init kasan_init(void)
{
int i;
@@ -176,10 +95,11 @@ void __init kasan_init(void)
memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
load_cr3(early_level4_pgt);
+ __flush_tlb_all();
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
- populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
kasan_mem_to_shadow((void *)PAGE_OFFSET));
for (i = 0; i < E820_X_MAX; i++) {
@@ -189,18 +109,22 @@ void __init kasan_init(void)
if (map_range(&pfn_mapped[i]))
panic("kasan: unable to allocate shadow!");
}
- populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
- kasan_mem_to_shadow((void *)__START_KERNEL_map));
+ kasan_populate_zero_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
(unsigned long)kasan_mem_to_shadow(_end),
NUMA_NO_NODE);
- populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
memset(kasan_zero_page, 0, PAGE_SIZE);
load_cr3(init_level4_pgt);
+ __flush_tlb_all();
init_task.kasan_depth = 0;
+
+ pr_info("Kernel address sanitizer initialized\n");
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 9d518d693b4b..844b06d67df4 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -126,3 +126,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MPX)
+ return "[mpx]";
+ return NULL;
+}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 7a657f58bbea..134948b0926f 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -20,20 +20,6 @@
#define CREATE_TRACE_POINTS
#include <asm/trace/mpx.h>
-static const char *mpx_mapping_name(struct vm_area_struct *vma)
-{
- return "[mpx]";
-}
-
-static struct vm_operations_struct mpx_vma_ops = {
- .name = mpx_mapping_name,
-};
-
-static int is_mpx_vma(struct vm_area_struct *vma)
-{
- return (vma->vm_ops == &mpx_vma_ops);
-}
-
static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
{
if (is_64bit_mm(mm))
@@ -53,65 +39,24 @@ static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
/*
* This is really a simplified "vm_mmap". it only handles MPX
* bounds tables (the bounds directory is user-allocated).
- *
- * Later on, we use the vma->vm_ops to uniquely identify these
- * VMAs.
*/
static unsigned long mpx_mmap(unsigned long len)
{
- unsigned long ret;
- unsigned long addr, pgoff;
struct mm_struct *mm = current->mm;
- vm_flags_t vm_flags;
- struct vm_area_struct *vma;
+ unsigned long addr, populate;
/* Only bounds table can be allocated here */
if (len != mpx_bt_size_bytes(mm))
return -EINVAL;
down_write(&mm->mmap_sem);
-
- /* Too many mappings? */
- if (mm->map_count > sysctl_max_map_count) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Obtain the address to map to. we verify (or select) it and ensure
- * that it represents a valid section of the address space.
- */
- addr = get_unmapped_area(NULL, 0, len, 0, MAP_ANONYMOUS | MAP_PRIVATE);
- if (addr & ~PAGE_MASK) {
- ret = addr;
- goto out;
- }
-
- vm_flags = VM_READ | VM_WRITE | VM_MPX |
- mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
- /* Set pgoff according to addr for anon_vma */
- pgoff = addr >> PAGE_SHIFT;
-
- ret = mmap_region(NULL, addr, len, vm_flags, pgoff);
- if (IS_ERR_VALUE(ret))
- goto out;
-
- vma = find_vma(mm, ret);
- if (!vma) {
- ret = -ENOMEM;
- goto out;
- }
- vma->vm_ops = &mpx_vma_ops;
-
- if (vm_flags & VM_LOCKED) {
- up_write(&mm->mmap_sem);
- mm_populate(ret, len);
- return ret;
- }
-
-out:
+ addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate);
up_write(&mm->mmap_sem);
- return ret;
+ if (populate)
+ mm_populate(addr, populate);
+
+ return addr;
}
enum reg_type {
@@ -812,7 +757,7 @@ static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
* so stop immediately and return an error. This
* probably results in a SIGSEGV.
*/
- if (!is_mpx_vma(vma))
+ if (!(vma->vm_flags & VM_MPX))
return -EINVAL;
len = min(vma->vm_end, end) - addr;
@@ -945,9 +890,9 @@ static int try_unmap_single_bt(struct mm_struct *mm,
* lots of tables even though we have no actual table
* entries in use.
*/
- while (next && is_mpx_vma(next))
+ while (next && (next->vm_flags & VM_MPX))
next = next->vm_next;
- while (prev && is_mpx_vma(prev))
+ while (prev && (prev->vm_flags & VM_MPX))
prev = prev->vm_prev;
/*
* We know 'start' and 'end' lie within an area controlled
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 4053bb58bf92..c3b3f653ed0c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
- /* and there's no empty block */
- if (bi->start >= bi->end)
+ /* and there's no empty or non-exist block */
+ if (bi->start >= bi->end ||
+ !memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start))
numa_remove_memblk_from(i--, mi);
}
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 8ff686aa7e8c..5f169d5d76a8 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -8,6 +8,7 @@
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
+#include <linux/init.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
@@ -256,5 +257,4 @@ static int start_pageattr_test(void)
return 0;
}
-
-module_init(start_pageattr_test);
+device_initcall(start_pageattr_test);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 727158cb3b3c..2c44c0792301 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -4,7 +4,6 @@
*/
#include <linux/highmem.h>
#include <linux/bootmem.h>
-#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 66338a60aa6e..c2aea63bee20 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
node_set(node, numa_nodes_parsed);
- pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n",
+ pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
node, pxm,
(unsigned long long) start, (unsigned long long) end - 1,
- hotpluggable ? " hotplug" : "");
+ hotpluggable ? " hotplug" : "",
+ ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");
/* Mark hotplug range in memblock. */
if (hotpluggable && memblock_mark_hotplug(start, ma->length))
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3250f2371aea..8ddb5d0d66fb 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -117,7 +117,7 @@ static void flush_tlb_func(void *info)
} else {
unsigned long addr;
unsigned long nr_pages =
- f->flush_end - f->flush_start / PAGE_SIZE;
+ (f->flush_end - f->flush_start) / PAGE_SIZE;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);
@@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
info.flush_end = end;
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
+ trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
if (is_uv_system()) {
unsigned int cpu;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 579a8fd74be0..70efcd0940f9 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
* goto out;
* if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
- * prog = array->prog[index];
+ * prog = array->ptrs[index];
* if (prog == NULL)
* goto out;
* goto *(prog->bpf_func + prologue_size);
@@ -269,7 +269,7 @@ static void emit_bpf_tail_call(u8 **pprog)
EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */
offsetof(struct bpf_array, map.max_entries));
EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */
-#define OFFSET1 44 /* number of bytes to jump */
+#define OFFSET1 47 /* number of bytes to jump */
EMIT2(X86_JBE, OFFSET1); /* jbe out */
label1 = cnt;
@@ -278,15 +278,15 @@ static void emit_bpf_tail_call(u8 **pprog)
*/
EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
-#define OFFSET2 33
+#define OFFSET2 36
EMIT2(X86_JA, OFFSET2); /* ja out */
label2 = cnt;
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
- /* prog = array->prog[index]; */
- EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */
- EMIT1(offsetof(struct bpf_array, prog));
+ /* prog = array->ptrs[index]; */
+ EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
+ offsetof(struct bpf_array, ptrs));
EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
/* if (prog == NULL)
@@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog)
*pprog = prog;
}
+
+static void emit_load_skb_data_hlen(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* r9d = skb->len - skb->data_len (headlen)
+ * r10 = skb->data
+ */
+ /* mov %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len));
+
+ /* sub %r9d, off32(%rdi) */
+ EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len));
+
+ /* mov %r10, off32(%rdi) */
+ EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data));
+ *pprog = prog;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
emit_prologue(&prog);
- if (seen_ld_abs) {
- /* r9d : skb->len - skb->data_len (headlen)
- * r10 : skb->data
- */
- if (is_imm8(offsetof(struct sk_buff, len)))
- /* mov %r9d, off8(%rdi) */
- EMIT4(0x44, 0x8b, 0x4f,
- offsetof(struct sk_buff, len));
- else
- /* mov %r9d, off32(%rdi) */
- EMIT3_off32(0x44, 0x8b, 0x8f,
- offsetof(struct sk_buff, len));
-
- if (is_imm8(offsetof(struct sk_buff, data_len)))
- /* sub %r9d, off8(%rdi) */
- EMIT4(0x44, 0x2b, 0x4f,
- offsetof(struct sk_buff, data_len));
- else
- EMIT3_off32(0x44, 0x2b, 0x8f,
- offsetof(struct sk_buff, data_len));
-
- if (is_imm8(offsetof(struct sk_buff, data)))
- /* mov %r10, off8(%rdi) */
- EMIT4(0x4c, 0x8b, 0x57,
- offsetof(struct sk_buff, data));
- else
- /* mov %r10, off32(%rdi) */
- EMIT3_off32(0x4c, 0x8b, 0x97,
- offsetof(struct sk_buff, data));
- }
+ if (seen_ld_abs)
+ emit_load_skb_data_hlen(&prog);
for (i = 0; i < insn_cnt; i++, insn++) {
const s32 imm32 = insn->imm;
@@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
u8 b1 = 0, b2 = 0, b3 = 0;
s64 jmp_offset;
u8 jmp_cond;
+ bool reload_skb_data;
int ilen;
u8 *func;
@@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
- EMIT2(0x41, 0x52); /* push %r10 */
- EMIT2(0x41, 0x51); /* push %r9 */
- /* need to adjust jmp offset, since
- * pop %r9, pop %r10 take 4 bytes after call insn
- */
- jmp_offset += 4;
+ reload_skb_data = bpf_helper_changes_skb_data(func);
+ if (reload_skb_data) {
+ EMIT1(0x57); /* push %rdi */
+ jmp_offset += 22; /* pop, mov, sub, mov */
+ } else {
+ EMIT2(0x41, 0x52); /* push %r10 */
+ EMIT2(0x41, 0x51); /* push %r9 */
+ /* need to adjust jmp offset, since
+ * pop %r9, pop %r10 take 4 bytes after call insn
+ */
+ jmp_offset += 4;
+ }
}
if (!imm32 || !is_simm32(jmp_offset)) {
pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off))
}
EMIT1_off32(0xE8, jmp_offset);
if (seen_ld_abs) {
- EMIT2(0x41, 0x59); /* pop %r9 */
- EMIT2(0x41, 0x5A); /* pop %r10 */
+ if (reload_skb_data) {
+ EMIT1(0x5F); /* pop %rdi */
+ emit_load_skb_data_hlen(&prog);
+ } else {
+ EMIT2(0x41, 0x59); /* pop %r9 */
+ EMIT2(0x41, 0x5A); /* pop %r10 */
+ }
}
break;
@@ -1099,7 +1103,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
}
if (bpf_jit_enable > 1)
- bpf_jit_dump(prog->len, proglen, 0, image);
+ bpf_jit_dump(prog->len, proglen, pass + 1, image);
if (image) {
bpf_flush_icache(header, image + proglen);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8fd6f44aee83..dc78a4a9a466 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -673,24 +673,22 @@ int pcibios_add_device(struct pci_dev *dev)
return 0;
}
-int pcibios_enable_device(struct pci_dev *dev, int mask)
+int pcibios_alloc_irq(struct pci_dev *dev)
{
- int err;
-
- if ((err = pci_enable_resources(dev, mask)) < 0)
- return err;
-
- if (!pci_dev_msi_enabled(dev))
- return pcibios_enable_irq(dev);
- return 0;
+ return pcibios_enable_irq(dev);
}
-void pcibios_disable_device (struct pci_dev *dev)
+void pcibios_free_irq(struct pci_dev *dev)
{
- if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
+ if (pcibios_disable_irq)
pcibios_disable_irq(dev);
}
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+ return pci_enable_resources(dev, mask);
+}
+
int pci_ext_cfg_avail(void)
{
if (raw_pci_ext_ops)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 9a2b7101ae8a..e58565556703 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -62,19 +62,6 @@ static void pci_fixup_umc_ide(struct pci_dev *d)
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide);
-static void pci_fixup_ncr53c810(struct pci_dev *d)
-{
- /*
- * NCR 53C810 returns class code 0 (at least on some systems).
- * Fix class to be PCI_CLASS_STORAGE_SCSI
- */
- if (!d->class) {
- dev_warn(&d->dev, "Fixing NCR 53C810 class code\n");
- d->class = PCI_CLASS_STORAGE_SCSI << 8;
- }
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C810, pci_fixup_ncr53c810);
-
static void pci_fixup_latency(struct pci_dev *d)
{
/*
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 27062303c881..0d24e7c10145 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -35,6 +35,9 @@
#define PCIE_CAP_OFFSET 0x100
+/* Quirks for the listed devices */
+#define PCI_DEVICE_ID_INTEL_MRFL_MMC 0x1190
+
/* Fixed BAR fields */
#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
#define PCI_FIXED_BAR_0_SIZE 0x04
@@ -210,22 +213,41 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
struct irq_alloc_info info;
int polarity;
+ int ret;
- if (dev->irq_managed && dev->irq > 0)
+ if (pci_has_managed_irq(dev))
return 0;
- if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
- polarity = 0; /* active high */
- else
- polarity = 1; /* active low */
+ switch (intel_mid_identify_cpu()) {
+ case INTEL_MID_CPU_CHIP_TANGIER:
+ polarity = IOAPIC_POL_HIGH;
+
+ /* Special treatment for IRQ0 */
+ if (dev->irq == 0) {
+ /*
+ * TNG has IRQ0 assigned to eMMC controller. But there
+ * are also other devices with bogus PCI configuration
+ * that have IRQ0 assigned. This check ensures that
+ * eMMC gets it.
+ */
+ if (dev->device != PCI_DEVICE_ID_INTEL_MRFL_MMC)
+ return -EBUSY;
+ }
+ break;
+ default:
+ polarity = IOAPIC_POL_LOW;
+ break;
+ }
+
ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
* IOAPIC RTE entries, so we just enable RTE for the device.
*/
- if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0)
- return -EBUSY;
+ ret = mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info);
+ if (ret < 0)
+ return ret;
dev->irq_managed = 1;
@@ -234,14 +256,17 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
static void intel_mid_pci_irq_disable(struct pci_dev *dev)
{
- if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
- dev->irq > 0) {
+ if (pci_has_managed_irq(dev)) {
mp_unmap_irq(dev->irq);
dev->irq_managed = 0;
+ /*
+ * Don't reset dev->irq here, otherwise
+ * intel_mid_pci_irq_enable() will fail on next call.
+ */
}
}
-struct pci_ops intel_mid_pci_ops = {
+static struct pci_ops intel_mid_pci_ops = {
.read = pci_read,
.write = pci_write,
};
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9bd115484745..32e70343e6fd 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1202,7 +1202,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
struct pci_dev *temp_dev;
int irq;
- if (dev->irq_managed && dev->irq > 0)
+ if (pci_has_managed_irq(dev))
return 0;
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
@@ -1230,8 +1230,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
- dev->irq_managed = 1;
- dev->irq = irq;
+ pci_set_managed_irq(dev, irq);
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
return 0;
@@ -1257,24 +1256,10 @@ static int pirq_enable_irq(struct pci_dev *dev)
return 0;
}
-bool mp_should_keep_irq(struct device *dev)
-{
- if (dev->power.is_prepared)
- return true;
-#ifdef CONFIG_PM
- if (dev->power.runtime_status == RPM_SUSPENDING)
- return true;
-#endif
-
- return false;
-}
-
static void pirq_disable_irq(struct pci_dev *dev)
{
- if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
- dev->irq_managed && dev->irq) {
+ if (io_apic_assign_pci_irqs && pci_has_managed_irq(dev)) {
mp_unmap_irq(dev->irq);
- dev->irq = 0;
- dev->irq_managed = 0;
+ pci_reset_managed_irq(dev);
}
}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index d22f4b5bbc04..ff31ab464213 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -179,7 +179,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (ret)
goto error;
i = 0;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i],
(type == PCI_CAP_ID_MSI) ? nvec : 1,
(type == PCI_CAP_ID_MSIX) ?
@@ -230,7 +230,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
if (type == PCI_CAP_ID_MSI && nvec > 1)
return 1;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
__pci_read_msi_msg(msidesc, &msg);
pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
@@ -274,7 +274,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int ret = 0;
struct msi_desc *msidesc;
- list_for_each_entry(msidesc, &dev->msi_list, list) {
+ for_each_pci_msi_entry(msidesc, dev) {
struct physdev_map_pirq map_irq;
domid_t domid;
@@ -386,7 +386,7 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev)
{
struct msi_desc *msidesc;
- msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+ msidesc = first_pci_msi_entry(dev);
if (msidesc->msi_attrib.is_msix)
xen_pci_frontend_disable_msix(dev);
else
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index f1a6c8e86ddd..184842ef332e 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -5,6 +5,7 @@ obj-y += efi/
obj-y += geode/
obj-y += goldfish/
obj-y += iris/
+obj-y += intel/
obj-y += intel-mid/
obj-y += intel-quark/
obj-y += olpc/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
index 0a3a40cbc794..40983f5b0858 100644
--- a/arch/x86/platform/atom/Makefile
+++ b/arch/x86/platform/atom/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
+obj-$(CONFIG_PMC_ATOM) += pmc_atom.o
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/platform/atom/pmc_atom.c
index d66a4fe6caee..964ff4fc61f9 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/platform/atom/pmc_atom.c
@@ -15,7 +15,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/device.h>
@@ -25,80 +24,149 @@
#include <asm/pmc_atom.h>
+struct pmc_bit_map {
+ const char *name;
+ u32 bit_mask;
+};
+
+struct pmc_reg_map {
+ const struct pmc_bit_map *d3_sts_0;
+ const struct pmc_bit_map *d3_sts_1;
+ const struct pmc_bit_map *func_dis;
+ const struct pmc_bit_map *func_dis_2;
+ const struct pmc_bit_map *pss;
+};
+
struct pmc_dev {
u32 base_addr;
void __iomem *regmap;
+ const struct pmc_reg_map *map;
#ifdef CONFIG_DEBUG_FS
struct dentry *dbgfs_dir;
#endif /* CONFIG_DEBUG_FS */
+ bool init;
};
static struct pmc_dev pmc_device;
static u32 acpi_base_addr;
-struct pmc_bit_map {
- const char *name;
- u32 bit_mask;
+static const struct pmc_bit_map d3_sts_0_map[] = {
+ {"LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
+ {"LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
+ {"LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
+ {"LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
+ {"LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
+ {"LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
+ {"LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
+ {"LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
+ {"SCC_EMMC", BIT_SCC_EMMC},
+ {"SCC_SDIO", BIT_SCC_SDIO},
+ {"SCC_SDCARD", BIT_SCC_SDCARD},
+ {"SCC_MIPI", BIT_SCC_MIPI},
+ {"HDA", BIT_HDA},
+ {"LPE", BIT_LPE},
+ {"OTG", BIT_OTG},
+ {"USH", BIT_USH},
+ {"GBE", BIT_GBE},
+ {"SATA", BIT_SATA},
+ {"USB_EHCI", BIT_USB_EHCI},
+ {"SEC", BIT_SEC},
+ {"PCIE_PORT0", BIT_PCIE_PORT0},
+ {"PCIE_PORT1", BIT_PCIE_PORT1},
+ {"PCIE_PORT2", BIT_PCIE_PORT2},
+ {"PCIE_PORT3", BIT_PCIE_PORT3},
+ {"LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
+ {"LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
+ {"LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
+ {"LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
+ {"LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
+ {"LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
+ {"LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
+ {"LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
+ {},
+};
+
+static struct pmc_bit_map byt_d3_sts_1_map[] = {
+ {"SMB", BIT_SMB},
+ {"OTG_SS_PHY", BIT_OTG_SS_PHY},
+ {"USH_SS_PHY", BIT_USH_SS_PHY},
+ {"DFX", BIT_DFX},
+ {},
};
-static const struct pmc_bit_map dev_map[] = {
- {"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
- {"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
- {"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
- {"3 - LPSS1_F3_HSUART1", BIT_LPSS1_F3_HSUART1},
- {"4 - LPSS1_F4_HSUART2", BIT_LPSS1_F4_HSUART2},
- {"5 - LPSS1_F5_SPI", BIT_LPSS1_F5_SPI},
- {"6 - LPSS1_F6_Reserved", BIT_LPSS1_F6_XXX},
- {"7 - LPSS1_F7_Reserved", BIT_LPSS1_F7_XXX},
- {"8 - SCC_EMMC", BIT_SCC_EMMC},
- {"9 - SCC_SDIO", BIT_SCC_SDIO},
- {"10 - SCC_SDCARD", BIT_SCC_SDCARD},
- {"11 - SCC_MIPI", BIT_SCC_MIPI},
- {"12 - HDA", BIT_HDA},
- {"13 - LPE", BIT_LPE},
- {"14 - OTG", BIT_OTG},
- {"15 - USH", BIT_USH},
- {"16 - GBE", BIT_GBE},
- {"17 - SATA", BIT_SATA},
- {"18 - USB_EHCI", BIT_USB_EHCI},
- {"19 - SEC", BIT_SEC},
- {"20 - PCIE_PORT0", BIT_PCIE_PORT0},
- {"21 - PCIE_PORT1", BIT_PCIE_PORT1},
- {"22 - PCIE_PORT2", BIT_PCIE_PORT2},
- {"23 - PCIE_PORT3", BIT_PCIE_PORT3},
- {"24 - LPSS2_F0_DMA", BIT_LPSS2_F0_DMA},
- {"25 - LPSS2_F1_I2C1", BIT_LPSS2_F1_I2C1},
- {"26 - LPSS2_F2_I2C2", BIT_LPSS2_F2_I2C2},
- {"27 - LPSS2_F3_I2C3", BIT_LPSS2_F3_I2C3},
- {"28 - LPSS2_F3_I2C4", BIT_LPSS2_F4_I2C4},
- {"29 - LPSS2_F5_I2C5", BIT_LPSS2_F5_I2C5},
- {"30 - LPSS2_F6_I2C6", BIT_LPSS2_F6_I2C6},
- {"31 - LPSS2_F7_I2C7", BIT_LPSS2_F7_I2C7},
- {"32 - SMB", BIT_SMB},
- {"33 - OTG_SS_PHY", BIT_OTG_SS_PHY},
- {"34 - USH_SS_PHY", BIT_USH_SS_PHY},
- {"35 - DFX", BIT_DFX},
+static struct pmc_bit_map cht_d3_sts_1_map[] = {
+ {"SMB", BIT_SMB},
+ {"GMM", BIT_STS_GMM},
+ {"ISH", BIT_STS_ISH},
+ {},
};
-static const struct pmc_bit_map pss_map[] = {
- {"0 - GBE", PMC_PSS_BIT_GBE},
- {"1 - SATA", PMC_PSS_BIT_SATA},
- {"2 - HDA", PMC_PSS_BIT_HDA},
- {"3 - SEC", PMC_PSS_BIT_SEC},
- {"4 - PCIE", PMC_PSS_BIT_PCIE},
- {"5 - LPSS", PMC_PSS_BIT_LPSS},
- {"6 - LPE", PMC_PSS_BIT_LPE},
- {"7 - DFX", PMC_PSS_BIT_DFX},
- {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL},
- {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS},
- {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS},
- {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA},
- {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL},
- {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS},
- {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK},
- {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA},
- {"16 - USB", PMC_PSS_BIT_USB},
- {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS},
+static struct pmc_bit_map cht_func_dis_2_map[] = {
+ {"SMB", BIT_SMB},
+ {"GMM", BIT_FD_GMM},
+ {"ISH", BIT_FD_ISH},
+ {},
+};
+
+static const struct pmc_bit_map byt_pss_map[] = {
+ {"GBE", PMC_PSS_BIT_GBE},
+ {"SATA", PMC_PSS_BIT_SATA},
+ {"HDA", PMC_PSS_BIT_HDA},
+ {"SEC", PMC_PSS_BIT_SEC},
+ {"PCIE", PMC_PSS_BIT_PCIE},
+ {"LPSS", PMC_PSS_BIT_LPSS},
+ {"LPE", PMC_PSS_BIT_LPE},
+ {"DFX", PMC_PSS_BIT_DFX},
+ {"USH_CTRL", PMC_PSS_BIT_USH_CTRL},
+ {"USH_SUS", PMC_PSS_BIT_USH_SUS},
+ {"USH_VCCS", PMC_PSS_BIT_USH_VCCS},
+ {"USH_VCCA", PMC_PSS_BIT_USH_VCCA},
+ {"OTG_CTRL", PMC_PSS_BIT_OTG_CTRL},
+ {"OTG_VCCS", PMC_PSS_BIT_OTG_VCCS},
+ {"OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK},
+ {"OTG_VCCA", PMC_PSS_BIT_OTG_VCCA},
+ {"USB", PMC_PSS_BIT_USB},
+ {"USB_SUS", PMC_PSS_BIT_USB_SUS},
+ {},
+};
+
+static const struct pmc_bit_map cht_pss_map[] = {
+ {"SATA", PMC_PSS_BIT_SATA},
+ {"HDA", PMC_PSS_BIT_HDA},
+ {"SEC", PMC_PSS_BIT_SEC},
+ {"PCIE", PMC_PSS_BIT_PCIE},
+ {"LPSS", PMC_PSS_BIT_LPSS},
+ {"LPE", PMC_PSS_BIT_LPE},
+ {"UFS", PMC_PSS_BIT_CHT_UFS},
+ {"UXD", PMC_PSS_BIT_CHT_UXD},
+ {"UXD_FD", PMC_PSS_BIT_CHT_UXD_FD},
+ {"UX_ENG", PMC_PSS_BIT_CHT_UX_ENG},
+ {"USB_SUS", PMC_PSS_BIT_CHT_USB_SUS},
+ {"GMM", PMC_PSS_BIT_CHT_GMM},
+ {"ISH", PMC_PSS_BIT_CHT_ISH},
+ {"DFX_MASTER", PMC_PSS_BIT_CHT_DFX_MASTER},
+ {"DFX_CLUSTER1", PMC_PSS_BIT_CHT_DFX_CLUSTER1},
+ {"DFX_CLUSTER2", PMC_PSS_BIT_CHT_DFX_CLUSTER2},
+ {"DFX_CLUSTER3", PMC_PSS_BIT_CHT_DFX_CLUSTER3},
+ {"DFX_CLUSTER4", PMC_PSS_BIT_CHT_DFX_CLUSTER4},
+ {"DFX_CLUSTER5", PMC_PSS_BIT_CHT_DFX_CLUSTER5},
+ {},
+};
+
+static const struct pmc_reg_map byt_reg_map = {
+ .d3_sts_0 = d3_sts_0_map,
+ .d3_sts_1 = byt_d3_sts_1_map,
+ .func_dis = d3_sts_0_map,
+ .func_dis_2 = byt_d3_sts_1_map,
+ .pss = byt_pss_map,
+};
+
+static const struct pmc_reg_map cht_reg_map = {
+ .d3_sts_0 = d3_sts_0_map,
+ .d3_sts_1 = cht_d3_sts_1_map,
+ .func_dis = d3_sts_0_map,
+ .func_dis_2 = cht_func_dis_2_map,
+ .pss = cht_pss_map,
};
static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
@@ -111,6 +179,30 @@ static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
writel(val, pmc->regmap + reg_offset);
}
+int pmc_atom_read(int offset, u32 *value)
+{
+ struct pmc_dev *pmc = &pmc_device;
+
+ if (!pmc->init)
+ return -ENODEV;
+
+ *value = pmc_reg_read(pmc, offset);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_read);
+
+int pmc_atom_write(int offset, u32 value)
+{
+ struct pmc_dev *pmc = &pmc_device;
+
+ if (!pmc->init)
+ return -ENODEV;
+
+ pmc_reg_write(pmc, offset, value);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pmc_atom_write);
+
static void pmc_power_off(void)
{
u16 pm1_cnt_port;
@@ -142,37 +234,39 @@ static void pmc_hw_reg_setup(struct pmc_dev *pmc)
}
#ifdef CONFIG_DEBUG_FS
+static void pmc_dev_state_print(struct seq_file *s, int reg_index,
+ u32 sts, const struct pmc_bit_map *sts_map,
+ u32 fd, const struct pmc_bit_map *fd_map)
+{
+ int offset = PMC_REG_BIT_WIDTH * reg_index;
+ int index;
+
+ for (index = 0; sts_map[index].name; index++) {
+ seq_printf(s, "Dev: %-2d - %-32s\tState: %s [%s]\n",
+ offset + index, sts_map[index].name,
+ fd_map[index].bit_mask & fd ? "Disabled" : "Enabled ",
+ sts_map[index].bit_mask & sts ? "D3" : "D0");
+ }
+}
+
static int pmc_dev_state_show(struct seq_file *s, void *unused)
{
struct pmc_dev *pmc = s->private;
- u32 func_dis, func_dis_2, func_dis_index;
- u32 d3_sts_0, d3_sts_1, d3_sts_index;
- int dev_num, dev_index, reg_index;
+ const struct pmc_reg_map *m = pmc->map;
+ u32 func_dis, func_dis_2;
+ u32 d3_sts_0, d3_sts_1;
func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
- dev_num = ARRAY_SIZE(dev_map);
-
- for (dev_index = 0; dev_index < dev_num; dev_index++) {
- reg_index = dev_index / PMC_REG_BIT_WIDTH;
- if (reg_index) {
- func_dis_index = func_dis_2;
- d3_sts_index = d3_sts_1;
- } else {
- func_dis_index = func_dis;
- d3_sts_index = d3_sts_0;
- }
-
- seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
- dev_map[dev_index].name,
- dev_map[dev_index].bit_mask & func_dis_index ?
- "Disabled" : "Enabled ",
- dev_map[dev_index].bit_mask & d3_sts_index ?
- "D3" : "D0");
- }
+ /* Low part */
+ pmc_dev_state_print(s, 0, d3_sts_0, m->d3_sts_0, func_dis, m->func_dis);
+
+ /* High part */
+ pmc_dev_state_print(s, 1, d3_sts_1, m->d3_sts_1, func_dis_2, m->func_dis_2);
+
return 0;
}
@@ -191,13 +285,14 @@ static const struct file_operations pmc_dev_state_ops = {
static int pmc_pss_state_show(struct seq_file *s, void *unused)
{
struct pmc_dev *pmc = s->private;
+ const struct pmc_bit_map *map = pmc->map->pss;
u32 pss = pmc_reg_read(pmc, PMC_PSS);
- int pss_index;
+ int index;
- for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
- seq_printf(s, "Island: %-32s\tState: %s\n",
- pss_map[pss_index].name,
- pss_map[pss_index].bit_mask & pss ? "Off" : "On");
+ for (index = 0; map[index].name; index++) {
+ seq_printf(s, "Island: %-2d - %-32s\tState: %s\n",
+ index, map[index].name,
+ map[index].bit_mask & pss ? "Off" : "On");
}
return 0;
}
@@ -250,7 +345,7 @@ static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
debugfs_remove_recursive(pmc->dbgfs_dir);
}
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
{
struct dentry *dir, *f;
@@ -262,24 +357,18 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_dev_state_ops);
- if (!f) {
- dev_err(&pdev->dev, "dev_state register failed\n");
+ if (!f)
goto err;
- }
f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_pss_state_ops);
- if (!f) {
- dev_err(&pdev->dev, "pss_state register failed\n");
+ if (!f)
goto err;
- }
f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_sleep_tmr_ops);
- if (!f) {
- dev_err(&pdev->dev, "sleep_state register failed\n");
+ if (!f)
goto err;
- }
return 0;
err:
@@ -287,15 +376,16 @@ err:
return -ENODEV;
}
#else
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+static int pmc_dbgfs_register(struct pmc_dev *pmc)
{
return 0;
}
#endif /* CONFIG_DEBUG_FS */
-static int pmc_setup_dev(struct pci_dev *pdev)
+static int pmc_setup_dev(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct pmc_dev *pmc = &pmc_device;
+ const struct pmc_reg_map *map = (struct pmc_reg_map *)ent->driver_data;
int ret;
/* Obtain ACPI base address */
@@ -315,32 +405,30 @@ static int pmc_setup_dev(struct pci_dev *pdev)
return -ENOMEM;
}
+ pmc->map = map;
+
/* PMC hardware registers setup */
pmc_hw_reg_setup(pmc);
- ret = pmc_dbgfs_register(pmc, pdev);
- if (ret) {
- iounmap(pmc->regmap);
- }
+ ret = pmc_dbgfs_register(pmc);
+ if (ret)
+ dev_warn(&pdev->dev, "debugfs register failed\n");
+ pmc->init = true;
return ret;
}
/*
* Data for PCI driver interface
*
- * This data only exists for exporting the supported
- * PCI ids via MODULE_DEVICE_TABLE. We do not actually
- * register a pci_driver, because lpc_ich will register
- * a driver on the same PCI id.
+ * used by pci_match_id() call below.
*/
static const struct pci_device_id pmc_pci_ids[] = {
- { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_VLV_PMC), (kernel_ulong_t)&byt_reg_map },
+ { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_CHT_PMC), (kernel_ulong_t)&cht_reg_map },
{ 0, },
};
-MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
-
static int __init pmc_atom_init(void)
{
struct pci_dev *pdev = NULL;
@@ -357,15 +445,16 @@ static int __init pmc_atom_init(void)
for_each_pci_dev(pdev) {
ent = pci_match_id(pmc_pci_ids, pdev);
if (ent)
- return pmc_setup_dev(pdev);
+ return pmc_setup_dev(pdev, ent);
}
/* Device not found. */
return -ENODEV;
}
-module_init(pmc_atom_init);
-/* no module_exit, this driver shouldn't be unloaded */
+device_initcall(pmc_atom_init);
+/*
MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
MODULE_LICENSE("GPL v2");
+*/
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index cfba30f27392..6a28ded74211 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -650,7 +650,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
static void __init save_runtime_map(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *tmp, *p, *q = NULL;
int count = 0;
@@ -705,6 +705,70 @@ out:
}
/*
+ * Iterate the EFI memory map in reverse order because the regions
+ * will be mapped top-down. The end result is the same as if we had
+ * mapped things forward, but doesn't require us to change the
+ * existing implementation of efi_map_region().
+ */
+static inline void *efi_map_next_entry_reverse(void *entry)
+{
+ /* Initial call */
+ if (!entry)
+ return memmap.map_end - memmap.desc_size;
+
+ entry -= memmap.desc_size;
+ if (entry < memmap.map)
+ return NULL;
+
+ return entry;
+}
+
+/*
+ * efi_map_next_entry - Return the next EFI memory map descriptor
+ * @entry: Previous EFI memory map descriptor
+ *
+ * This is a helper function to iterate over the EFI memory map, which
+ * we do in different orders depending on the current configuration.
+ *
+ * To begin traversing the memory map @entry must be %NULL.
+ *
+ * Returns %NULL when we reach the end of the memory map.
+ */
+static void *efi_map_next_entry(void *entry)
+{
+ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
+ /*
+ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
+ * config table feature requires us to map all entries
+ * in the same order as they appear in the EFI memory
+ * map. That is to say, entry N must have a lower
+ * virtual address than entry N+1. This is because the
+ * firmware toolchain leaves relative references in
+ * the code/data sections, which are split and become
+ * separate EFI memory regions. Mapping things
+ * out-of-order leads to the firmware accessing
+ * unmapped addresses.
+ *
+ * Since we need to map things this way whether or not
+ * the kernel actually makes use of
+ * EFI_PROPERTIES_TABLE, let's just switch to this
+ * scheme by default for 64-bit.
+ */
+ return efi_map_next_entry_reverse(entry);
+ }
+
+ /* Initial call */
+ if (!entry)
+ return memmap.map;
+
+ entry += memmap.desc_size;
+ if (entry >= memmap.map_end)
+ return NULL;
+
+ return entry;
+}
+
+/*
* Map the efi memory ranges of the runtime services and update new_mmap with
* virtual addresses.
*/
@@ -714,7 +778,8 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
unsigned long left = 0;
efi_memory_desc_t *md;
- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ p = NULL;
+ while ((p = efi_map_next_entry(p))) {
md = p;
if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
#ifdef CONFIG_X86_64
@@ -748,7 +813,7 @@ static void * __init efi_map_regions(int *count, int *pg_shift)
static void __init kexec_enter_virtual_mode(void)
{
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
void *p;
@@ -972,6 +1037,11 @@ u64 efi_mem_attributes(unsigned long phys_addr)
static int __init arch_parse_efi_cmdline(char *str)
{
+ if (!str) {
+ pr_warn("need at least one option\n");
+ return -EINVAL;
+ }
+
if (parse_option_str(str, "old_map"))
set_bit(EFI_OLD_MEMMAP, &efi.flags);
if (parse_option_str(str, "debug"))
diff --git a/arch/x86/platform/intel/Makefile b/arch/x86/platform/intel/Makefile
new file mode 100644
index 000000000000..b878032fbc82
--- /dev/null
+++ b/arch/x86/platform/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index 82f8d02f0df2..edf2c54bf131 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -30,7 +30,9 @@
#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
#define PCI_DEVICE_ID_BRASWELL 0x2280
#define PCI_DEVICE_ID_QUARK_X1000 0x0958
+#define PCI_DEVICE_ID_TANGIER 0x1170
+static struct pci_dev *mbi_pdev;
static DEFINE_SPINLOCK(iosf_mbi_lock);
static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
@@ -38,8 +40,6 @@ static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
}
-static struct pci_dev *mbi_pdev; /* one mbi device */
-
static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
{
int result;
@@ -104,7 +104,7 @@ int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
unsigned long flags;
int ret;
- /*Access to the GFX unit is handled by GPU code */
+ /* Access to the GFX unit is handled by GPU code */
if (port == BT_MBI_UNIT_GFX) {
WARN_ON(1);
return -EPERM;
@@ -127,7 +127,7 @@ int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
unsigned long flags;
int ret;
- /*Access to the GFX unit is handled by GPU code */
+ /* Access to the GFX unit is handled by GPU code */
if (port == BT_MBI_UNIT_GFX) {
WARN_ON(1);
return -EPERM;
@@ -151,7 +151,7 @@ int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
unsigned long flags;
int ret;
- /*Access to the GFX unit is handled by GPU code */
+ /* Access to the GFX unit is handled by GPU code */
if (port == BT_MBI_UNIT_GFX) {
WARN_ON(1);
return -EPERM;
@@ -240,17 +240,17 @@ static void iosf_sideband_debug_init(void)
/* mdr */
d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
- if (IS_ERR_OR_NULL(d))
+ if (!d)
goto cleanup;
/* mcrx */
- debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
- if (IS_ERR_OR_NULL(d))
+ d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+ if (!d)
goto cleanup;
/* mcr - initiates mailbox tranaction */
- debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
- if (IS_ERR_OR_NULL(d))
+ d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+ if (!d)
goto cleanup;
return;
@@ -292,6 +292,7 @@ static const struct pci_device_id iosf_mbi_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_TANGIER) },
{ 0, },
};
MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
@@ -314,10 +315,8 @@ static void __exit iosf_mbi_exit(void)
iosf_debugfs_remove();
pci_unregister_driver(&iosf_mbi_pci_driver);
- if (mbi_pdev) {
- pci_dev_put(mbi_pdev);
- mbi_pdev = NULL;
- }
+ pci_dev_put(mbi_pdev);
+ mbi_pdev = NULL;
}
module_init(iosf_mbi_init);
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 8570abe68be1..e1c24631afbb 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -89,7 +89,7 @@ static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
return -EINVAL;
chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
- irq_data->node);
+ irq_data_get_node(irq_data));
if (!chip_data)
return -ENOMEM;
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index 020c101c255f..5c9f63fa6abf 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -492,7 +492,7 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
static atomic_t uv_nmi_kexec_failed;
static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
@@ -519,13 +519,13 @@ static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
uv_nmi_sync_exit(0);
}
-#else /* !CONFIG_KEXEC */
+#else /* !CONFIG_KEXEC_CORE */
static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
{
if (master)
pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
}
-#endif /* !CONFIG_KEXEC */
+#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index a244237f3cfa..2b158a9fa1d7 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -32,8 +32,7 @@
static cycle_t uv_read_rtc(struct clocksource *cs);
static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
-static void uv_rtc_timer_setup(enum clock_event_mode,
- struct clock_event_device *);
+static int uv_rtc_shutdown(struct clock_event_device *evt);
static struct clocksource clocksource_uv = {
.name = RTC_NAME,
@@ -44,14 +43,14 @@ static struct clocksource clocksource_uv = {
};
static struct clock_event_device clock_event_device_uv = {
- .name = RTC_NAME,
- .features = CLOCK_EVT_FEAT_ONESHOT,
- .shift = 20,
- .rating = 400,
- .irq = -1,
- .set_next_event = uv_rtc_next_event,
- .set_mode = uv_rtc_timer_setup,
- .event_handler = NULL,
+ .name = RTC_NAME,
+ .features = CLOCK_EVT_FEAT_ONESHOT,
+ .shift = 20,
+ .rating = 400,
+ .irq = -1,
+ .set_next_event = uv_rtc_next_event,
+ .set_state_shutdown = uv_rtc_shutdown,
+ .event_handler = NULL,
};
static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
@@ -321,24 +320,14 @@ static int uv_rtc_next_event(unsigned long delta,
}
/*
- * Setup the RTC timer in oneshot mode
+ * Shutdown the RTC timer
*/
-static void uv_rtc_timer_setup(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int uv_rtc_shutdown(struct clock_event_device *evt)
{
int ced_cpu = cpumask_first(evt->cpumask);
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- /* Nothing to do here yet */
- break;
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- uv_rtc_unset_timer(ced_cpu, 1);
- break;
- }
+ uv_rtc_unset_timer(ced_cpu, 1);
+ return 0;
}
static void uv_rtc_interrupt(void)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0d7dd1f5ac36..9ab52791fed5 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -22,6 +22,7 @@
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/cpu.h>
+#include <asm/mmu_context.h>
#ifdef CONFIG_X86_32
__visible unsigned long saved_context_ebx;
@@ -153,7 +154,7 @@ static void fix_processor_context(void)
syscall_init(); /* This sets MSR_*STAR and related */
#endif
load_TR_desc(); /* This does ltr */
- load_LDT(&current->active_mm->context); /* This does lldt */
+ load_mm_ldt(current->active_mm); /* This does lldt */
fpu__resume_cpu();
}
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
new file mode 100644
index 000000000000..10fea5fc821e
--- /dev/null
+++ b/arch/x86/ras/Kconfig
@@ -0,0 +1,11 @@
+config AMD_MCE_INJ
+ tristate "Simple MCE injection interface for AMD processors"
+ depends on RAS && EDAC_DECODE_MCE && DEBUG_FS
+ default n
+ help
+ This is a simple debugfs interface to inject MCEs and test different
+ aspects of the MCE handling code.
+
+ WARNING: Do not even assume this interface is staying stable!
+
+
diff --git a/arch/x86/ras/Makefile b/arch/x86/ras/Makefile
new file mode 100644
index 000000000000..dd2c98b84037
--- /dev/null
+++ b/arch/x86/ras/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_AMD_MCE_INJ) += mce_amd_inj.o
+
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
new file mode 100644
index 000000000000..17e35b5bf779
--- /dev/null
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -0,0 +1,375 @@
+/*
+ * A simple MCE injection facility for testing different aspects of the RAS
+ * code. This driver should be built as module so that it can be loaded
+ * on production kernels for testing purposes.
+ *
+ * This file may be distributed under the terms of the GNU General Public
+ * License version 2.
+ *
+ * Copyright (c) 2010-15: Borislav Petkov <bp@alien8.de>
+ * Advanced Micro Devices Inc.
+ */
+
+#include <linux/kobject.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <asm/mce.h>
+
+#include "../kernel/cpu/mcheck/mce-internal.h"
+
+/*
+ * Collect all the MCi_XXX settings
+ */
+static struct mce i_mce;
+static struct dentry *dfs_inj;
+
+static u8 n_banks;
+
+#define MAX_FLAG_OPT_SIZE 3
+
+enum injection_type {
+ SW_INJ = 0, /* SW injection, simply decode the error */
+ HW_INJ, /* Trigger a #MC */
+ N_INJ_TYPES,
+};
+
+static const char * const flags_options[] = {
+ [SW_INJ] = "sw",
+ [HW_INJ] = "hw",
+ NULL
+};
+
+/* Set default injection to SW_INJ */
+static enum injection_type inj_type = SW_INJ;
+
+#define MCE_INJECT_SET(reg) \
+static int inj_##reg##_set(void *data, u64 val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ m->reg = val; \
+ return 0; \
+}
+
+MCE_INJECT_SET(status);
+MCE_INJECT_SET(misc);
+MCE_INJECT_SET(addr);
+
+#define MCE_INJECT_GET(reg) \
+static int inj_##reg##_get(void *data, u64 *val) \
+{ \
+ struct mce *m = (struct mce *)data; \
+ \
+ *val = m->reg; \
+ return 0; \
+}
+
+MCE_INJECT_GET(status);
+MCE_INJECT_GET(misc);
+MCE_INJECT_GET(addr);
+
+DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
+
+/*
+ * Caller needs to be make sure this cpu doesn't disappear
+ * from under us, i.e.: get_cpu/put_cpu.
+ */
+static int toggle_hw_mce_inject(unsigned int cpu, bool enable)
+{
+ u32 l, h;
+ int err;
+
+ err = rdmsr_on_cpu(cpu, MSR_K7_HWCR, &l, &h);
+ if (err) {
+ pr_err("%s: error reading HWCR\n", __func__);
+ return err;
+ }
+
+ enable ? (l |= BIT(18)) : (l &= ~BIT(18));
+
+ err = wrmsr_on_cpu(cpu, MSR_K7_HWCR, l, h);
+ if (err)
+ pr_err("%s: error writing HWCR\n", __func__);
+
+ return err;
+}
+
+static int __set_inj(const char *buf)
+{
+ int i;
+
+ for (i = 0; i < N_INJ_TYPES; i++) {
+ if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ inj_type = i;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static ssize_t flags_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE];
+ int n;
+
+ n = sprintf(buf, "%s\n", flags_options[inj_type]);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
+}
+
+static ssize_t flags_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[MAX_FLAG_OPT_SIZE], *__buf;
+ int err;
+ size_t ret;
+
+ if (cnt > MAX_FLAG_OPT_SIZE)
+ cnt = MAX_FLAG_OPT_SIZE;
+
+ ret = cnt;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt - 1] = 0;
+
+ /* strip whitespace */
+ __buf = strstrip(buf);
+
+ err = __set_inj(__buf);
+ if (err) {
+ pr_err("%s: Invalid flags value: %s\n", __func__, __buf);
+ return err;
+ }
+
+ *ppos += ret;
+
+ return ret;
+}
+
+static const struct file_operations flags_fops = {
+ .read = flags_read,
+ .write = flags_write,
+ .llseek = generic_file_llseek,
+};
+
+/*
+ * On which CPU to inject?
+ */
+MCE_INJECT_GET(extcpu);
+
+static int inj_extcpu_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= nr_cpu_ids || !cpu_online(val)) {
+ pr_err("%s: Invalid CPU: %llu\n", __func__, val);
+ return -EINVAL;
+ }
+ m->extcpu = val;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(extcpu_fops, inj_extcpu_get, inj_extcpu_set, "%llu\n");
+
+static void trigger_mce(void *info)
+{
+ asm volatile("int $18");
+}
+
+static void do_inject(void)
+{
+ u64 mcg_status = 0;
+ unsigned int cpu = i_mce.extcpu;
+ u8 b = i_mce.bank;
+
+ if (i_mce.misc)
+ i_mce.status |= MCI_STATUS_MISCV;
+
+ if (inj_type == SW_INJ) {
+ mce_inject_log(&i_mce);
+ return;
+ }
+
+ /* prep MCE global settings for the injection */
+ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV;
+
+ if (!(i_mce.status & MCI_STATUS_PCC))
+ mcg_status |= MCG_STATUS_RIPV;
+
+ get_online_cpus();
+ if (!cpu_online(cpu))
+ goto err;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS,
+ (u32)mcg_status, (u32)(mcg_status >> 32));
+
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_STATUS(b),
+ (u32)i_mce.status, (u32)(i_mce.status >> 32));
+
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_ADDR(b),
+ (u32)i_mce.addr, (u32)(i_mce.addr >> 32));
+
+ wrmsr_on_cpu(cpu, MSR_IA32_MCx_MISC(b),
+ (u32)i_mce.misc, (u32)(i_mce.misc >> 32));
+
+ toggle_hw_mce_inject(cpu, false);
+
+ smp_call_function_single(cpu, trigger_mce, NULL, 0);
+
+err:
+ put_online_cpus();
+
+}
+
+/*
+ * This denotes into which bank we're injecting and triggers
+ * the injection, at the same time.
+ */
+static int inj_bank_set(void *data, u64 val)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (val >= n_banks) {
+ pr_err("Non-existent MCE bank: %llu\n", val);
+ return -EINVAL;
+ }
+
+ m->bank = val;
+ do_inject();
+
+ return 0;
+}
+
+MCE_INJECT_GET(bank);
+
+DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n");
+
+static const char readme_msg[] =
+"Description of the files and their usages:\n"
+"\n"
+"Note1: i refers to the bank number below.\n"
+"Note2: See respective BKDGs for the exact bit definitions of the files below\n"
+"as they mirror the hardware registers.\n"
+"\n"
+"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n"
+"\t attributes of the error which caused the MCE.\n"
+"\n"
+"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n"
+"\t used for error thresholding purposes and its validity is indicated by\n"
+"\t MCi_STATUS[MiscV].\n"
+"\n"
+"addr:\t Error address value to be written to MCi_ADDR. Log address information\n"
+"\t associated with the error.\n"
+"\n"
+"cpu:\t The CPU to inject the error on.\n"
+"\n"
+"bank:\t Specify the bank you want to inject the error into: the number of\n"
+"\t banks in a processor varies and is family/model-specific, therefore, the\n"
+"\t supplied value is sanity-checked. Setting the bank value also triggers the\n"
+"\t injection.\n"
+"\n"
+"flags:\t Injection type to be performed. Writing to this file will trigger a\n"
+"\t real machine check, an APIC interrupt or invoke the error decoder routines\n"
+"\t for AMD processors.\n"
+"\n"
+"\t Allowed error injection types:\n"
+"\t - \"sw\": Software error injection. Decode error to a human-readable \n"
+"\t format only. Safe to use.\n"
+"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n"
+"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n"
+"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n"
+"\t before injecting.\n"
+"\n";
+
+static ssize_t
+inj_readme_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return simple_read_from_buffer(ubuf, cnt, ppos,
+ readme_msg, strlen(readme_msg));
+}
+
+static const struct file_operations readme_fops = {
+ .read = inj_readme_read,
+};
+
+static struct dfs_node {
+ char *name;
+ struct dentry *d;
+ const struct file_operations *fops;
+ umode_t perm;
+} dfs_fls[] = {
+ { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH },
+};
+
+static int __init init_mce_inject(void)
+{
+ int i;
+ u64 cap;
+
+ rdmsrl(MSR_IA32_MCG_CAP, cap);
+ n_banks = cap & MCG_BANKCNT_MASK;
+
+ dfs_inj = debugfs_create_dir("mce-inject", NULL);
+ if (!dfs_inj)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) {
+ dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name,
+ dfs_fls[i].perm,
+ dfs_inj,
+ &i_mce,
+ dfs_fls[i].fops);
+
+ if (!dfs_fls[i].d)
+ goto err_dfs_add;
+ }
+
+ return 0;
+
+err_dfs_add:
+ while (--i >= 0)
+ debugfs_remove(dfs_fls[i].d);
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+
+ return -ENOMEM;
+}
+
+static void __exit exit_mce_inject(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dfs_fls); i++)
+ debugfs_remove(dfs_fls[i].d);
+
+ memset(&dfs_fls, 0, sizeof(dfs_fls));
+
+ debugfs_remove(dfs_inj);
+ dfs_inj = NULL;
+}
+module_init(init_mce_inject);
+module_exit(exit_mce_inject);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Borislav Petkov <bp@alien8.de>");
+MODULE_AUTHOR("AMD Inc.");
+MODULE_DESCRIPTION("MCE injection facility for RAS testing");
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index b9531d343134..755481f14d90 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -45,17 +45,4 @@
#define read_barrier_depends() do { } while (0)
#define smp_read_barrier_depends() do { } while (0)
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
- alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
- "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
#endif
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index e88fda867a33..c7b15f3e2cf3 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -7,8 +7,9 @@ config XEN
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
+ select XEN_HAVE_VPMU
depends on X86_64 || (X86_32 && X86_PAE)
- depends on X86_TSC
+ depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@@ -17,20 +18,24 @@ config XEN
config XEN_DOM0
def_bool y
depends on XEN && PCI_XEN && SWIOTLB_XEN
- depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
+ depends on X86_IO_APIC && ACPI && PCI
config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
-config XEN_MAX_DOMAIN_MEMORY
- int
- default 500 if X86_64
- default 64 if X86_32
- depends on XEN
- help
- This only affects the sizing of some bss arrays, the unused
- portions of which are freed.
+config XEN_512GB
+ bool "Limit Xen pv-domain memory to 512GB"
+ depends on XEN && X86_64
+ default y
+ help
+ Limit paravirtualized user domains to 512GB of RAM.
+
+ The Xen tools and crash dump analysis tools might not support
+ pv-domains with more than 512 GB of RAM. This option controls the
+ default setting of the kernel to use only up to 512 GB or more.
+ It is always possible to change the default via specifying the
+ boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
bool
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 7322755f337a..e47e52787d32 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,13 +13,13 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
- p2m.o
+ p2m.o apic.o pmu.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 70e060ad879a..acda713ab5be 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
+#include "pmu.h"
#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
@@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg)
static void xen_apic_write(u32 reg, u32 val)
{
+ if (reg == APIC_LVTPC) {
+ (void)pmu_apic_update(reg);
+ return;
+ }
+
/* Warn to see if there's any stray references */
WARN(1,"register: %x, value: %x\n", reg, val);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0b95c9b8283f..993b7a71386d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,10 @@
#include <linux/memblock.h>
#include <linux/edd.h>
+#ifdef CONFIG_KEXEC_CORE
+#include <linux/kexec.h>
+#endif
+
#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
@@ -84,6 +88,7 @@
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
+#include "pmu.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -483,6 +488,7 @@ static void set_aliased_prot(void *v, pgprot_t prot)
pte_t pte;
unsigned long pfn;
struct page *page;
+ unsigned char dummy;
ptep = lookup_address((unsigned long)v, &level);
BUG_ON(ptep == NULL);
@@ -492,6 +498,32 @@ static void set_aliased_prot(void *v, pgprot_t prot)
pte = pfn_pte(pfn, prot);
+ /*
+ * Careful: update_va_mapping() will fail if the virtual address
+ * we're poking isn't populated in the page tables. We don't
+ * need to worry about the direct map (that's always in the page
+ * tables), but we need to be careful about vmap space. In
+ * particular, the top level page table can lazily propagate
+ * entries between processes, so if we've switched mms since we
+ * vmapped the target in the first place, we might not have the
+ * top-level page table entry populated.
+ *
+ * We disable preemption because we want the same mm active when
+ * we probe the target and when we issue the hypercall. We'll
+ * have the same nominal mm, but if we're a kernel thread, lazy
+ * mm dropping could change our pgd.
+ *
+ * Out of an abundance of caution, this uses __get_user() to fault
+ * in the target address just in case there's some obscure case
+ * in which the target address isn't readable.
+ */
+
+ preempt_disable();
+
+ pagefault_disable(); /* Avoid warnings due to being atomic. */
+ __get_user(dummy, (unsigned char __user __force *)v);
+ pagefault_enable();
+
if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
BUG();
@@ -503,6 +535,8 @@ static void set_aliased_prot(void *v, pgprot_t prot)
BUG();
} else
kmap_flush_unused();
+
+ preempt_enable();
}
static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
@@ -510,6 +544,17 @@ static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
int i;
+ /*
+ * We need to mark the all aliases of the LDT pages RO. We
+ * don't need to call vm_flush_aliases(), though, since that's
+ * only responsible for flushing aliases out the TLBs, not the
+ * page tables, and Xen will flush the TLB for us if needed.
+ *
+ * To avoid confusing future readers: none of this is necessary
+ * to load the LDT. The hypervisor only checks this when the
+ * LDT is faulted in due to subsequent descriptor access.
+ */
+
for(i = 0; i < entries; i += entries_per_page)
set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}
@@ -970,8 +1015,7 @@ static void xen_write_cr0(unsigned long cr0)
static void xen_write_cr4(unsigned long cr4)
{
- cr4 &= ~X86_CR4_PGE;
- cr4 &= ~X86_CR4_PSE;
+ cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);
native_write_cr4(cr4);
}
@@ -990,6 +1034,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
u64 val;
+ if (pmu_msr_read(msr, &val, err))
+ return val;
+
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
@@ -1034,9 +1081,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
/* Fast syscall setup is all done in hypercalls, so
these are all ignored. Stub them out here to stop
Xen console noise. */
+ break;
default:
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, &ret))
+ ret = native_write_msr_safe(msr, low, high);
}
return ret;
@@ -1175,10 +1224,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = xen_read_msr_safe,
.write_msr = xen_write_msr_safe,
- .read_tsc = native_read_tsc,
- .read_pmc = native_read_pmc,
-
- .read_tscp = native_read_tscp,
+ .read_pmc = xen_read_pmc,
.iret = xen_iret,
#ifdef CONFIG_X86_64
@@ -1227,6 +1273,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = {
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
@@ -1570,7 +1620,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
- xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
+ xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
+ xen_start_info->nr_pages);
+ xen_reserve_special_pages();
/*
* Modify the cache mode translation tables to match Xen's PAT
@@ -1760,6 +1812,21 @@ static struct notifier_block xen_hvm_cpu_notifier = {
.notifier_call = xen_hvm_cpu_notify,
};
+#ifdef CONFIG_KEXEC_CORE
+static void xen_hvm_shutdown(void)
+{
+ native_machine_shutdown();
+ if (kexec_in_progress)
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+
+static void xen_hvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_machine_crash_shutdown(regs);
+ xen_reboot(SHUTDOWN_soft_reset);
+}
+#endif
+
static void __init xen_hvm_guest_init(void)
{
if (xen_pv_domain())
@@ -1779,6 +1846,10 @@ static void __init xen_hvm_guest_init(void)
x86_init.irqs.intr_init = xen_init_IRQ;
xen_hvm_init_time_ops();
xen_hvm_init_mmu_ops();
+#ifdef CONFIG_KEXEC_CORE
+ machine_ops.shutdown = xen_hvm_shutdown;
+ machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
+#endif
}
#endif
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dd151b2045b0..9c479fe40459 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static phys_addr_t xen_pt_base, xen_pt_size __initdata;
/*
* Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm)
static void xen_post_allocator_init(void);
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+ struct mmuext_op op;
+
+ op.cmd = cmd;
+ op.arg1.mfn = pfn_to_mfn(pfn);
+ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+ BUG();
+}
+
#ifdef CONFIG_X86_64
static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
xen_mc_flush();
}
+/*
+ * Make a page range writeable and free it.
+ */
+static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
+{
+ void *vaddr = __va(paddr);
+ void *vaddr_end = vaddr + size;
+
+ for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
+ make_lowmem_page_readwrite(vaddr);
+
+ memblock_free(paddr, size);
+}
+
+static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
+{
+ unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
+
+ if (unpin)
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
+ ClearPagePinned(virt_to_page(__va(pa)));
+ xen_free_ro_pages(pa, PAGE_SIZE);
+}
+
+/*
+ * Since it is well isolated we can (and since it is perhaps large we should)
+ * also free the page tables mapping the initial P->M table.
+ */
+static void __init xen_cleanmfnmap(unsigned long vaddr)
+{
+ unsigned long va = vaddr & PMD_MASK;
+ unsigned long pa;
+ pgd_t *pgd = pgd_offset_k(va);
+ pud_t *pud_page = pud_offset(pgd, 0);
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned int i;
+ bool unpin;
+
+ unpin = (vaddr == 2 * PGDIR_SIZE);
+ set_pgd(pgd, __pgd(0));
+ do {
+ pud = pud_page + pud_index(va);
+ if (pud_none(*pud)) {
+ va += PUD_SIZE;
+ } else if (pud_large(*pud)) {
+ pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PUD_SIZE);
+ va += PUD_SIZE;
+ } else {
+ pmd = pmd_offset(pud, va);
+ if (pmd_large(*pmd)) {
+ pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+ xen_free_ro_pages(pa, PMD_SIZE);
+ } else if (!pmd_none(*pmd)) {
+ pte = pte_offset_kernel(pmd, va);
+ set_pmd(pmd, __pmd(0));
+ for (i = 0; i < PTRS_PER_PTE; ++i) {
+ if (pte_none(pte[i]))
+ break;
+ pa = pte_pfn(pte[i]) << PAGE_SHIFT;
+ xen_free_ro_pages(pa, PAGE_SIZE);
+ }
+ xen_cleanmfnmap_free_pgtbl(pte, unpin);
+ }
+ va += PMD_SIZE;
+ if (pmd_index(va))
+ continue;
+ set_pud(pud, __pud(0));
+ xen_cleanmfnmap_free_pgtbl(pmd, unpin);
+ }
+
+ } while (pud_index(va) || pmd_index(va));
+ xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+}
+
static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
@@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void)
/* using __ka address and sticking INVALID_P2M_ENTRY! */
memset((void *)xen_start_info->mfn_list, 0xff, size);
- /* We should be in __ka space. */
- BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
addr = xen_start_info->mfn_list;
- /* We roundup to the PMD, which means that if anybody at this stage is
- * using the __ka address of xen_start_info or xen_start_info->shared_info
- * they are in going to crash. Fortunatly we have already revectored
- * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
+ /*
+ * We could be in __ka space.
+ * We roundup to the PMD, which means that if anybody at this stage is
+ * using the __ka address of xen_start_info or
+ * xen_start_info->shared_info they are in going to crash. Fortunatly
+ * we have already revectored in xen_setup_kernel_pagetable and in
+ * xen_setup_shared_info.
+ */
size = roundup(size, PMD_SIZE);
- xen_cleanhighmap(addr, addr + size);
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- memblock_free(__pa(xen_start_info->mfn_list), size);
+ if (addr >= __START_KERNEL_map) {
+ xen_cleanhighmap(addr, addr + size);
+ size = PAGE_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ memblock_free(__pa(addr), size);
+ } else {
+ xen_cleanmfnmap(addr);
+ }
+}
+
+static void __init xen_pagetable_cleanhighmap(void)
+{
+ unsigned long size;
+ unsigned long addr;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void)
#ifdef CONFIG_X86_64
xen_pagetable_p2m_free();
+
+ xen_pagetable_cleanhighmap();
#endif
/* And revector! Bye bye old array */
xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
@@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
#else /* CONFIG_X86_64 */
static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
{
+ unsigned long pfn;
+
+ if (xen_feature(XENFEAT_writable_page_tables) ||
+ xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->mfn_list >= __START_KERNEL_map)
+ return pte;
+
+ /*
+ * Pages belonging to the initial p2m list mapped outside the default
+ * address range must be mapped read-only. This region contains the
+ * page tables for mapping the p2m list, too, and page tables MUST be
+ * mapped read-only.
+ */
+ pfn = pte_pfn(pte);
+ if (pfn >= xen_start_info->first_p2m_pfn &&
+ pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
+ pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
+
return pte;
}
#endif /* CONFIG_X86_64 */
@@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
-static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
- struct mmuext_op op;
- op.cmd = cmd;
- op.arg1.mfn = pfn_to_mfn(pfn);
- if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
- BUG();
-}
-
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
@@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
* mappings. Considering that on Xen after the kernel mappings we
* have the mappings of some pages that don't exist in pfn space, we
* set max_pfn_mapped to the last real pfn mapped. */
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+ if (xen_start_info->mfn_list < __START_KERNEL_map)
+ max_pfn_mapped = xen_start_info->first_p2m_pfn;
+ else
+ max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
pt_end = pt_base + xen_start_info->nr_pt_frames;
@@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
+ /* Copy the initial P->M table mappings if necessary. */
+ i = pgd_index(xen_start_info->mfn_list);
+ if (i && i < pgd_index(__START_KERNEL_map))
+ init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
+
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
check_pt_base(&pt_base, &pt_end, addr[i]);
/* Our (by three pages) smaller Xen pagetable that we are using */
- memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
+ xen_pt_base = PFN_PHYS(pt_base);
+ xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
+ memblock_reserve(xen_pt_base, xen_pt_size);
+
/* Revector the xen_start_info */
xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}
+
+/*
+ * Read a value from a physical address.
+ */
+static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
+{
+ unsigned long *vaddr;
+ unsigned long val;
+
+ vaddr = early_memremap_ro(addr, sizeof(val));
+ val = *vaddr;
+ early_memunmap(vaddr, sizeof(val));
+ return val;
+}
+
+/*
+ * Translate a virtual address to a physical one without relying on mapped
+ * page tables.
+ */
+static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
+{
+ phys_addr_t pa;
+ pgd_t pgd;
+ pud_t pud;
+ pmd_t pmd;
+ pte_t pte;
+
+ pa = read_cr3();
+ pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
+ sizeof(pgd)));
+ if (!pgd_present(pgd))
+ return 0;
+
+ pa = pgd_val(pgd) & PTE_PFN_MASK;
+ pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
+ sizeof(pud)));
+ if (!pud_present(pud))
+ return 0;
+ pa = pud_pfn(pud) << PAGE_SHIFT;
+ if (pud_large(pud))
+ return pa + (vaddr & ~PUD_MASK);
+
+ pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
+ sizeof(pmd)));
+ if (!pmd_present(pmd))
+ return 0;
+ pa = pmd_pfn(pmd) << PAGE_SHIFT;
+ if (pmd_large(pmd))
+ return pa + (vaddr & ~PMD_MASK);
+
+ pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
+ sizeof(pte)));
+ if (!pte_present(pte))
+ return 0;
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+
+ return pa | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
+ * this area.
+ */
+void __init xen_relocate_p2m(void)
+{
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
+ int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ pte_t *pt;
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd;
+ unsigned long *new_p2m;
+
+ size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
+ n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
+ n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
+ n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
+ n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+ n_frames = n_pte + n_pt + n_pmd + n_pud;
+
+ new_area = xen_find_free_area(PFN_PHYS(n_frames));
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ /*
+ * Setup the page tables for addressing the new p2m list.
+ * We have asked the hypervisor to map the p2m list at the user address
+ * PUD_SIZE. It may have done so, or it may have used a kernel space
+ * address depending on the Xen version.
+ * To avoid any possible virtual address collision, just use
+ * 2 * PUD_SIZE for the new area.
+ */
+ pud_phys = new_area;
+ pmd_phys = pud_phys + PFN_PHYS(n_pud);
+ pt_phys = pmd_phys + PFN_PHYS(n_pmd);
+ p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
+
+ pgd = __va(read_cr3());
+ new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
+ }
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
+ }
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
+ }
+
+ /* Now copy the old p2m info to the new area. */
+ memcpy(new_p2m, xen_p2m_addr, size);
+ xen_p2m_addr = new_p2m;
+
+ /* Release the old p2m list and set new list info. */
+ p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
+ BUG_ON(!p2m_pfn);
+ p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
+
+ if (xen_start_info->mfn_list < __START_KERNEL_map) {
+ pfn = xen_start_info->first_p2m_pfn;
+ pfn_end = xen_start_info->first_p2m_pfn +
+ xen_start_info->nr_p2m_frames;
+ set_pgd(pgd + 1, __pgd(0));
+ } else {
+ pfn = p2m_pfn;
+ pfn_end = p2m_pfn_end;
+ }
+
+ memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
+ while (pfn < pfn_end) {
+ if (pfn == p2m_pfn) {
+ pfn = p2m_pfn_end;
+ continue;
+ }
+ make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+ pfn++;
+ }
+
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+ xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
+ xen_start_info->nr_p2m_frames = n_frames;
+}
+
#else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
@@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3)
pv_mmu_ops.write_cr3 = &xen_write_cr3;
}
+/*
+ * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
+ * not the first page table in the page table pool.
+ * Iterate through the initial page tables to find the real page table base.
+ */
+static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+{
+ phys_addr_t pt_base, paddr;
+ unsigned pmdidx;
+
+ pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
+
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
+ if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
+ paddr = m2p(pmd[pmdidx].pmd);
+ pt_base = min(pt_base, paddr);
+ }
+
+ return pt_base;
+}
+
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
pmd_t *kernel_pmd;
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+
+ xen_pt_base = xen_find_pt_base(kernel_pmd);
+ xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
+
initial_kernel_pmd =
extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
- max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
- xen_start_info->nr_pt_frames * PAGE_SIZE +
- 512*1024);
+ max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
- kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
copy_page(initial_kernel_pmd, kernel_pmd);
xen_map_identity_early(initial_kernel_pmd, max_pfn);
@@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table));
- memblock_reserve(__pa(xen_start_info->pt_base),
- xen_start_info->nr_pt_frames * PAGE_SIZE);
+ memblock_reserve(xen_pt_base, xen_pt_size);
}
#endif /* CONFIG_X86_64 */
+void __init xen_reserve_special_pages(void)
+{
+ phys_addr_t paddr;
+
+ memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
+ if (xen_start_info->store_mfn) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+ if (!xen_initial_domain()) {
+ paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
+ memblock_reserve(paddr, PAGE_SIZE);
+ }
+}
+
+void __init xen_pt_check_e820(void)
+{
+ if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
+ xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
+ BUG();
+ }
+}
+
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
@@ -2465,9 +2812,9 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
return 0;
}
-static int do_remap_mfn(struct vm_area_struct *vma,
+static int do_remap_gfn(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t *mfn, int nr,
+ xen_pfn_t *gfn, int nr,
int *err_ptr, pgprot_t prot,
unsigned domid,
struct page **pages)
@@ -2483,14 +2830,14 @@ static int do_remap_mfn(struct vm_area_struct *vma,
if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_XEN_PVH
/* We need to update the local page tables and the xen HAP */
- return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+ return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
prot, domid, pages);
#else
return -EINVAL;
#endif
}
- rmd.mfn = mfn;
+ rmd.mfn = gfn;
rmd.prot = prot;
/* We use the err_ptr to indicate if there we are doing a contigious
* mapping or a discontigious mapping. */
@@ -2518,8 +2865,8 @@ static int do_remap_mfn(struct vm_area_struct *vma,
batch_left, &done, domid);
/*
- * @err_ptr may be the same buffer as @mfn, so
- * only clear it after each chunk of @mfn is
+ * @err_ptr may be the same buffer as @gfn, so
+ * only clear it after each chunk of @gfn is
* used.
*/
if (err_ptr) {
@@ -2549,19 +2896,19 @@ out:
return err < 0 ? err : mapped;
}
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t mfn, int nr,
+ xen_pfn_t gfn, int nr,
pgprot_t prot, unsigned domid,
struct page **pages)
{
- return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
+ return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
}
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
-int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
unsigned long addr,
- xen_pfn_t *mfn, int nr,
+ xen_pfn_t *gfn, int nr,
int *err_ptr, pgprot_t prot,
unsigned domid, struct page **pages)
{
@@ -2570,13 +2917,13 @@ int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
* cause of "wrong memory was mapped in".
*/
BUG_ON(err_ptr == NULL);
- return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+ return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
}
-EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
/* Returns: 0 success */
-int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
int numpgs, struct page **pages)
{
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
@@ -2588,4 +2935,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
return -EINVAL;
#endif
}
-EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
+EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 8b7f18e200aa..660b3cfef234 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -79,10 +79,14 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
-#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
unsigned long *xen_p2m_addr __read_mostly;
@@ -108,6 +112,15 @@ static unsigned long *p2m_identity;
static pte_t *p2m_missing_pte;
static pte_t *p2m_identity_pte;
+/*
+ * Hint at last populated PFN.
+ *
+ * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack
+ * can avoid scanning the whole P2M (which may be sized to account for
+ * hotplugged memory).
+ */
+static unsigned long xen_p2m_last_pfn;
+
static inline unsigned p2m_top_index(unsigned long pfn)
{
BUG_ON(pfn >= MAX_P2M_PFN);
@@ -199,7 +212,8 @@ void __ref xen_build_mfn_list_list(void)
unsigned int level, topidx, mididx;
unsigned long *mid_mfn_p;
- if (xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_feature(XENFEAT_auto_translated_physmap) ||
+ xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
@@ -260,9 +274,16 @@ void xen_setup_mfn_list_list(void)
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+ if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
+ else
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
+ HYPERVISOR_shared_info->arch.p2m_generation = 0;
+ HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
+ HYPERVISOR_shared_info->arch.p2m_cr3 =
+ xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
}
/* Set up p2m_top to point to the domain-builder provided p2m pages */
@@ -394,6 +415,8 @@ void __init xen_vmalloc_p2m_tree(void)
static struct vm_struct vm;
unsigned long p2m_limit;
+ xen_p2m_last_pfn = xen_max_p2m_pfn;
+
p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
vm.flags = VM_ALLOC;
vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
@@ -478,8 +501,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
ptechk = lookup_address(vaddr, &level);
if (ptechk == pte_pg) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
pte_newpg[i] = NULL;
}
@@ -505,7 +532,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
*/
static bool alloc_p2m(unsigned long pfn)
{
- unsigned topidx, mididx;
+ unsigned topidx;
unsigned long *top_mfn_p, *mid_mfn;
pte_t *ptep, *pte_pg;
unsigned int level;
@@ -513,9 +540,6 @@ static bool alloc_p2m(unsigned long pfn)
unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
unsigned long p2m_pfn;
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
ptep = lookup_address(addr, &level);
BUG_ON(!ptep || level != PG_LEVEL_4K);
pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -527,7 +551,8 @@ static bool alloc_p2m(unsigned long pfn)
return false;
}
- if (p2m_top_mfn) {
+ if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
+ topidx = p2m_top_index(pfn);
top_mfn_p = &p2m_top_mfn[topidx];
mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
@@ -577,10 +602,14 @@ static bool alloc_p2m(unsigned long pfn)
spin_lock_irqsave(&p2m_update_lock, flags);
if (pte_pfn(*ptep) == p2m_pfn) {
+ HYPERVISOR_shared_info->arch.p2m_generation++;
+ wmb(); /* Tools are synchronizing via p2m_generation. */
set_pte(ptep,
pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ wmb(); /* Tools are synchronizing via p2m_generation. */
+ HYPERVISOR_shared_info->arch.p2m_generation++;
if (mid_mfn)
- mid_mfn[mididx] = virt_to_mfn(p2m);
+ mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
p2m = NULL;
}
@@ -590,6 +619,12 @@ static bool alloc_p2m(unsigned long pfn)
free_p2m_page(p2m);
}
+ /* Expanded the p2m? */
+ if (pfn > xen_p2m_last_pfn) {
+ xen_p2m_last_pfn = pfn;
+ HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
+ }
+
return true;
}
@@ -630,6 +665,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
+ /*
+ * The interface requires atomic updates on p2m elements.
+ * xen_safe_write_ulong() is using __put_user which does an atomic
+ * store via asm().
+ */
if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
return true;
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
deleted file mode 100644
index ad8aee24ab72..000000000000
--- a/arch/x86/xen/p2m.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _XEN_P2M_H
-#define _XEN_P2M_H
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-#define MAX_REMAP_RANGES 10
-
-extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
- unsigned long pfn_e);
-
-#endif /* _XEN_P2M_H */
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index a8261716d58d..9586ff32810c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int check_platform_magic(void)
return 0;
}
-bool xen_has_pv_devices()
+bool xen_has_pv_devices(void)
{
if (!xen_domain())
return false;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
new file mode 100644
index 000000000000..724a08740a04
--- /dev/null
+++ b/arch/x86/xen/pmu.c
@@ -0,0 +1,570 @@
+#include <linux/types.h>
+#include <linux/interrupt.h>
+
+#include <asm/xen/hypercall.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include "xen-ops.h"
+#include "pmu.h"
+
+/* x86_pmu.handle_irq definition */
+#include "../kernel/cpu/perf_event.h"
+
+#define XENPMU_IRQ_PROCESSING 1
+struct xenpmu {
+ /* Shared page between hypervisor and domain */
+ struct xen_pmu_data *xenpmu_data;
+
+ uint8_t flags;
+};
+static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared);
+#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data)
+#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags)
+
+/* Macro for computing address of a PMU MSR bank */
+#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \
+ (uintptr_t)ctxt->field))
+
+/* AMD PMU */
+#define F15H_NUM_COUNTERS 6
+#define F10H_NUM_COUNTERS 4
+
+static __read_mostly uint32_t amd_counters_base;
+static __read_mostly uint32_t amd_ctrls_base;
+static __read_mostly int amd_msr_step;
+static __read_mostly int k7_counters_mirrored;
+static __read_mostly int amd_num_counters;
+
+/* Intel PMU */
+#define MSR_TYPE_COUNTER 0
+#define MSR_TYPE_CTRL 1
+#define MSR_TYPE_GLOBAL 2
+#define MSR_TYPE_ARCH_COUNTER 3
+#define MSR_TYPE_ARCH_CTRL 4
+
+/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */
+#define PMU_GENERAL_NR_SHIFT 8
+#define PMU_GENERAL_NR_BITS 8
+#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \
+ << PMU_GENERAL_NR_SHIFT)
+
+/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */
+#define PMU_FIXED_NR_SHIFT 0
+#define PMU_FIXED_NR_BITS 5
+#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \
+ << PMU_FIXED_NR_SHIFT)
+
+/* Alias registers (0x4c1) for full-width writes to PMCs */
+#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0))
+
+#define INTEL_PMC_TYPE_SHIFT 30
+
+static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters;
+
+
+static void xen_pmu_arch_init(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+
+ switch (boot_cpu_data.x86) {
+ case 0x15:
+ amd_num_counters = F15H_NUM_COUNTERS;
+ amd_counters_base = MSR_F15H_PERF_CTR;
+ amd_ctrls_base = MSR_F15H_PERF_CTL;
+ amd_msr_step = 2;
+ k7_counters_mirrored = 1;
+ break;
+ case 0x10:
+ case 0x12:
+ case 0x14:
+ case 0x16:
+ default:
+ amd_num_counters = F10H_NUM_COUNTERS;
+ amd_counters_base = MSR_K7_PERFCTR0;
+ amd_ctrls_base = MSR_K7_EVNTSEL0;
+ amd_msr_step = 1;
+ k7_counters_mirrored = 0;
+ break;
+ }
+ } else {
+ uint32_t eax, ebx, ecx, edx;
+
+ cpuid(0xa, &eax, &ebx, &ecx, &edx);
+
+ intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >>
+ PMU_GENERAL_NR_SHIFT;
+ intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >>
+ PMU_FIXED_NR_SHIFT;
+ }
+}
+
+static inline uint32_t get_fam15h_addr(u32 addr)
+{
+ switch (addr) {
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0);
+ case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
+ return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0);
+ default:
+ break;
+ }
+
+ return addr;
+}
+
+static inline bool is_amd_pmu_msr(unsigned int msr)
+{
+ if ((msr >= MSR_F15H_PERF_CTL &&
+ msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
+ (msr >= MSR_K7_EVNTSEL0 &&
+ msr < MSR_K7_PERFCTR0 + amd_num_counters))
+ return true;
+
+ return false;
+}
+
+static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+{
+ u32 msr_index_pmc;
+
+ switch (msr_index) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_IA32_DS_AREA:
+ case MSR_IA32_PEBS_ENABLE:
+ *type = MSR_TYPE_CTRL;
+ return true;
+
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *type = MSR_TYPE_GLOBAL;
+ return true;
+
+ default:
+
+ if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) &&
+ (msr_index < MSR_CORE_PERF_FIXED_CTR0 +
+ intel_num_fixed_counters)) {
+ *index = msr_index - MSR_CORE_PERF_FIXED_CTR0;
+ *type = MSR_TYPE_COUNTER;
+ return true;
+ }
+
+ if ((msr_index >= MSR_P6_EVNTSEL0) &&
+ (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) {
+ *index = msr_index - MSR_P6_EVNTSEL0;
+ *type = MSR_TYPE_ARCH_CTRL;
+ return true;
+ }
+
+ msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK;
+ if ((msr_index_pmc >= MSR_IA32_PERFCTR0) &&
+ (msr_index_pmc < MSR_IA32_PERFCTR0 +
+ intel_num_arch_counters)) {
+ *type = MSR_TYPE_ARCH_COUNTER;
+ *index = msr_index_pmc - MSR_IA32_PERFCTR0;
+ return true;
+ }
+ return false;
+ }
+}
+
+static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
+ int index, bool is_read)
+{
+ uint64_t *reg = NULL;
+ struct xen_pmu_intel_ctxt *ctxt;
+ uint64_t *fix_counters;
+ struct xen_pmu_cntr_pair *arch_cntr_pair;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ ctxt = &xenpmu_data->pmu.c.intel;
+
+ switch (msr) {
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ reg = &ctxt->global_ovf_ctrl;
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ reg = &ctxt->global_status;
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ reg = &ctxt->global_ctrl;
+ break;
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ reg = &ctxt->fixed_ctrl;
+ break;
+ default:
+ switch (type) {
+ case MSR_TYPE_COUNTER:
+ fix_counters = field_offset(ctxt, fixed_counters);
+ reg = &fix_counters[index];
+ break;
+ case MSR_TYPE_ARCH_COUNTER:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].counter;
+ break;
+ case MSR_TYPE_ARCH_CTRL:
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ reg = &arch_cntr_pair[index].control;
+ break;
+ default:
+ return false;
+ }
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else {
+ *reg = *val;
+
+ if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL)
+ ctxt->global_status &= (~(*val));
+ }
+ return true;
+ }
+
+ return false;
+}
+
+static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+{
+ uint64_t *reg = NULL;
+ int i, off = 0;
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *counter_regs, *ctrl_regs;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING))
+ return false;
+
+ if (k7_counters_mirrored &&
+ ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3)))
+ msr = get_fam15h_addr(msr);
+
+ ctxt = &xenpmu_data->pmu.c.amd;
+ for (i = 0; i < amd_num_counters; i++) {
+ if (msr == amd_ctrls_base + off) {
+ ctrl_regs = field_offset(ctxt, ctrls);
+ reg = &ctrl_regs[i];
+ break;
+ } else if (msr == amd_counters_base + off) {
+ counter_regs = field_offset(ctxt, counters);
+ reg = &counter_regs[i];
+ break;
+ }
+ off += amd_msr_step;
+ }
+
+ if (reg) {
+ if (is_read)
+ *val = *reg;
+ else
+ *reg = *val;
+
+ return true;
+ }
+ return false;
+}
+
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (is_amd_pmu_msr(msr)) {
+ if (!xen_amd_pmu_emulate(msr, val, 1))
+ *val = native_read_msr_safe(msr, err);
+ return true;
+ }
+ } else {
+ int type, index;
+
+ if (is_intel_pmu_msr(msr, &type, &index)) {
+ if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
+ *val = native_read_msr_safe(msr, err);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+{
+ uint64_t val = ((uint64_t)high << 32) | low;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (is_amd_pmu_msr(msr)) {
+ if (!xen_amd_pmu_emulate(msr, &val, 0))
+ *err = native_write_msr_safe(msr, low, high);
+ return true;
+ }
+ } else {
+ int type, index;
+
+ if (is_intel_pmu_msr(msr, &type, &index)) {
+ if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
+ *err = native_write_msr_safe(msr, low, high);
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static unsigned long long xen_amd_read_pmc(int counter)
+{
+ struct xen_pmu_amd_ctxt *ctxt;
+ uint64_t *counter_regs;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+ uint32_t msr;
+ int err;
+
+ msr = amd_counters_base + (counter * amd_msr_step);
+ return native_read_msr_safe(msr, &err);
+ }
+
+ ctxt = &xenpmu_data->pmu.c.amd;
+ counter_regs = field_offset(ctxt, counters);
+ return counter_regs[counter];
+}
+
+static unsigned long long xen_intel_read_pmc(int counter)
+{
+ struct xen_pmu_intel_ctxt *ctxt;
+ uint64_t *fixed_counters;
+ struct xen_pmu_cntr_pair *arch_cntr_pair;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
+ uint32_t msr;
+ int err;
+
+ if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
+ msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
+ else
+ msr = MSR_IA32_PERFCTR0 + counter;
+
+ return native_read_msr_safe(msr, &err);
+ }
+
+ ctxt = &xenpmu_data->pmu.c.intel;
+ if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) {
+ fixed_counters = field_offset(ctxt, fixed_counters);
+ return fixed_counters[counter & 0xffff];
+ }
+
+ arch_cntr_pair = field_offset(ctxt, arch_counters);
+ return arch_cntr_pair[counter].counter;
+}
+
+unsigned long long xen_read_pmc(int counter)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return xen_amd_read_pmc(counter);
+ else
+ return xen_intel_read_pmc(counter);
+}
+
+int pmu_apic_update(uint32_t val)
+{
+ int ret;
+ struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return -EINVAL;
+ }
+
+ xenpmu_data->pmu.l.lapic_lvtpc = val;
+
+ if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING)
+ return 0;
+
+ ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL);
+
+ return ret;
+}
+
+/* perf callbacks */
+static int xen_is_in_guest(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF))
+ return 0;
+
+ return 1;
+}
+
+static int xen_is_user_mode(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV)
+ return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER);
+ else
+ return !!(xenpmu_data->pmu.r.regs.cpl & 3);
+}
+
+static unsigned long xen_get_guest_ip(void)
+{
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return 0;
+ }
+
+ return xenpmu_data->pmu.r.regs.ip;
+}
+
+static struct perf_guest_info_callbacks xen_guest_cbs = {
+ .is_in_guest = xen_is_in_guest,
+ .is_user_mode = xen_is_user_mode,
+ .get_guest_ip = xen_get_guest_ip,
+};
+
+/* Convert registers from Xen's format to Linux' */
+static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
+ struct pt_regs *regs, uint64_t pmu_flags)
+{
+ regs->ip = xen_regs->ip;
+ regs->cs = xen_regs->cs;
+ regs->sp = xen_regs->sp;
+
+ if (pmu_flags & PMU_SAMPLE_PV) {
+ if (pmu_flags & PMU_SAMPLE_USER)
+ regs->cs |= 3;
+ else
+ regs->cs &= ~3;
+ } else {
+ if (xen_regs->cpl)
+ regs->cs |= 3;
+ else
+ regs->cs &= ~3;
+ }
+}
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
+{
+ int err, ret = IRQ_NONE;
+ struct pt_regs regs;
+ const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
+ uint8_t xenpmu_flags = get_xenpmu_flags();
+
+ if (!xenpmu_data) {
+ pr_warn_once("%s: pmudata not initialized\n", __func__);
+ return ret;
+ }
+
+ this_cpu_ptr(&xenpmu_shared)->flags =
+ xenpmu_flags | XENPMU_IRQ_PROCESSING;
+ xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs,
+ xenpmu_data->pmu.pmu_flags);
+ if (x86_pmu.handle_irq(&regs))
+ ret = IRQ_HANDLED;
+
+ /* Write out cached context to HW */
+ err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL);
+ this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags;
+ if (err) {
+ pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err);
+ return IRQ_NONE;
+ }
+
+ return ret;
+}
+
+bool is_xen_pmu(int cpu)
+{
+ return (get_xenpmu_data() != NULL);
+}
+
+void xen_pmu_init(int cpu)
+{
+ int err;
+ struct xen_pmu_params xp;
+ unsigned long pfn;
+ struct xen_pmu_data *xenpmu_data;
+
+ BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE);
+
+ if (xen_hvm_domain())
+ return;
+
+ xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL);
+ if (!xenpmu_data) {
+ pr_err("VPMU init: No memory\n");
+ return;
+ }
+ pfn = virt_to_pfn(xenpmu_data);
+
+ xp.val = pfn_to_mfn(pfn);
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+ err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp);
+ if (err)
+ goto fail;
+
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data;
+ per_cpu(xenpmu_shared, cpu).flags = 0;
+
+ if (cpu == 0) {
+ perf_register_guest_info_callbacks(&xen_guest_cbs);
+ xen_pmu_arch_init();
+ }
+
+ return;
+
+fail:
+ pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n",
+ cpu, err);
+ free_pages((unsigned long)xenpmu_data, 0);
+}
+
+void xen_pmu_finish(int cpu)
+{
+ struct xen_pmu_params xp;
+
+ if (xen_hvm_domain())
+ return;
+
+ xp.vcpu = cpu;
+ xp.version.maj = XENPMU_VER_MAJ;
+ xp.version.min = XENPMU_VER_MIN;
+
+ (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp);
+
+ free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0);
+ per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL;
+}
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
new file mode 100644
index 000000000000..af5f0ad94078
--- /dev/null
+++ b/arch/x86/xen/pmu.h
@@ -0,0 +1,15 @@
+#ifndef __XEN_PMU_H
+#define __XEN_PMU_H
+
+#include <xen/interface/xenpmu.h>
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+bool is_xen_pmu(int cpu);
+bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
+bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+int pmu_apic_update(uint32_t reg);
+unsigned long long xen_read_pmc(int counter);
+
+#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 55f388ef481a..1c30e4ab1022 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -27,17 +27,23 @@
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
+#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "vdso.h"
-#include "p2m.h"
#include "mmu.h"
+#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
+
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
+/* E820 map used during setting up memory. */
+static struct e820entry xen_e820_map[E820MAX] __initdata;
+static u32 xen_e820_map_entries __initdata;
+
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
*/
#define EXTRA_MEM_RATIO (10)
-static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
+static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
+
+static void __init xen_parse_512gb(void)
+{
+ bool val = false;
+ char *arg;
+
+ arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit");
+ if (!arg)
+ return;
+
+ arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
+ if (!arg)
+ val = true;
+ else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ return;
+
+ xen_512gb_limit = val;
+}
+
+static void __init xen_add_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
{
int i;
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */
- if (xen_extra_mem[i].size == 0) {
- xen_extra_mem[i].start = start;
- xen_extra_mem[i].size = size;
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
break;
}
/* Append to existing region. */
- if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
- xen_extra_mem[i].size += size;
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
break;
}
}
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
- memblock_reserve(start, size);
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
-static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
+static void __init xen_del_extra_mem(unsigned long start_pfn,
+ unsigned long n_pfns)
{
int i;
- phys_addr_t start_r, size_r;
+ unsigned long start_r, size_r;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- start_r = xen_extra_mem[i].start;
- size_r = xen_extra_mem[i].size;
+ start_r = xen_extra_mem[i].start_pfn;
+ size_r = xen_extra_mem[i].n_pfns;
/* Start of region. */
- if (start_r == start) {
- BUG_ON(size > size_r);
- xen_extra_mem[i].start += size;
- xen_extra_mem[i].size -= size;
+ if (start_r == start_pfn) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].start_pfn += n_pfns;
+ xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* End of region. */
- if (start_r + size_r == start + size) {
- BUG_ON(size > size_r);
- xen_extra_mem[i].size -= size;
+ if (start_r + size_r == start_pfn + n_pfns) {
+ BUG_ON(n_pfns > size_r);
+ xen_extra_mem[i].n_pfns -= n_pfns;
break;
}
/* Mid of region. */
- if (start > start_r && start < start_r + size_r) {
- BUG_ON(start + size > start_r + size_r);
- xen_extra_mem[i].size = start - start_r;
+ if (start_pfn > start_r && start_pfn < start_r + size_r) {
+ BUG_ON(start_pfn + n_pfns > start_r + size_r);
+ xen_extra_mem[i].n_pfns = start_pfn - start_r;
/* Calling memblock_reserve() again is okay. */
- xen_add_extra_mem(start + size, start_r + size_r -
- (start + size));
+ xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r -
+ (start_pfn + n_pfns));
break;
}
}
- memblock_free(start, size);
+ memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
}
/*
@@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
int i;
- phys_addr_t addr = PFN_PHYS(pfn);
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- if (addr >= xen_extra_mem[i].start &&
- addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+ if (pfn >= xen_extra_mem[i].start_pfn &&
+ pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns)
return INVALID_P2M_ENTRY;
}
@@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void)
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- if (!xen_extra_mem[i].size)
+ if (!xen_extra_mem[i].n_pfns)
continue;
- pfn_s = PFN_DOWN(xen_extra_mem[i].start);
- pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+ pfn_s = xen_extra_mem[i].start_pfn;
+ pfn_e = pfn_s + xen_extra_mem[i].n_pfns;
for (pfn = pfn_s; pfn < pfn_e; pfn++)
set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
@@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void)
* This function updates min_pfn with the pfn found and returns
* the size of that range or zero if not found.
*/
-static unsigned long __init xen_find_pfn_range(
- const struct e820entry *list, size_t map_size,
- unsigned long *min_pfn)
+static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
- const struct e820entry *entry;
+ const struct e820entry *entry = xen_e820_map;
unsigned int i;
unsigned long done = 0;
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
@@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
+ unsigned long end_pfn, unsigned long nr_pages)
{
unsigned long pfn, end;
int ret;
@@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) {
- (*released)++;
+ xen_released_pages++;
if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
break;
} else
@@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
- const struct e820entry *list, size_t map_size, unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
- unsigned long *released, unsigned long *remapped)
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long remap_pfn)
{
unsigned long pfn;
unsigned long i = 0;
@@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (cur_pfn + size > nr_pages)
size = nr_pages - cur_pfn;
- remap_range_size = xen_find_pfn_range(list, map_size,
- &remap_pfn);
+ remap_range_size = xen_find_pfn_range(&remap_pfn);
if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages, released);
+ cur_pfn + left, nr_pages);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
- *remapped += size;
}
/*
@@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn;
}
-static void __init xen_set_identity_and_remap(
- const struct e820entry *list, size_t map_size, unsigned long nr_pages,
- unsigned long *released, unsigned long *remapped)
+static void __init xen_set_identity_and_remap(unsigned long nr_pages)
{
phys_addr_t start = 0;
unsigned long last_pfn = nr_pages;
- const struct e820entry *entry;
- unsigned long num_released = 0;
- unsigned long num_remapped = 0;
+ const struct e820entry *entry = xen_e820_map;
int i;
/*
@@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap(
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
- if (entry->type == E820_RAM || i == map_size - 1) {
+ if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
@@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap(
if (start_pfn < end_pfn)
last_pfn = xen_set_identity_and_remap_chunk(
- list, map_size, start_pfn,
- end_pfn, nr_pages, last_pfn,
- &num_released, &num_remapped);
+ start_pfn, end_pfn, nr_pages,
+ last_pfn);
start = end;
}
}
- *released = num_released;
- *remapped = num_remapped;
-
- pr_info("Released %ld page(s)\n", num_released);
+ pr_info("Released %ld page(s)\n", xen_released_pages);
}
/*
@@ -494,7 +513,7 @@ void __init xen_remap_memory(void)
} else if (pfn_s + len == xen_remap_buf.target_pfn) {
len += xen_remap_buf.size;
} else {
- xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ xen_del_extra_mem(pfn_s, len);
pfn_s = xen_remap_buf.target_pfn;
len = xen_remap_buf.size;
}
@@ -504,18 +523,35 @@ void __init xen_remap_memory(void)
}
if (pfn_s != ~0UL && len)
- xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ xen_del_extra_mem(pfn_s, len);
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
}
+static unsigned long __init xen_get_pages_limit(void)
+{
+ unsigned long limit;
+
+#ifdef CONFIG_X86_32
+ limit = GB(64) / PAGE_SIZE;
+#else
+ limit = MAXMEM / PAGE_SIZE;
+ if (!xen_initial_domain() && xen_512gb_limit)
+ limit = GB(512) / PAGE_SIZE;
+#endif
+ return limit;
+}
+
static unsigned long __init xen_get_max_pages(void)
{
- unsigned long max_pages = MAX_DOMAIN_PAGES;
+ unsigned long max_pages, limit;
domid_t domid = DOMID_SELF;
- int ret;
+ long ret;
+
+ limit = xen_get_pages_limit();
+ max_pages = limit;
/*
* For the initial domain we use the maximum reservation as
@@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void)
max_pages = ret;
}
- return min(max_pages, MAX_DOMAIN_PAGES);
+ return min(max_pages, limit);
}
static void __init xen_align_and_add_e820_region(phys_addr_t start,
@@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
e820_add_region(start, end - start, type);
}
-static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(void)
{
- struct e820entry *entry;
+ struct e820entry *entry = xen_e820_map;
unsigned int i;
- for (i = 0, entry = list; i < map_size; i++, entry++) {
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
if (entry->type == E820_UNUSABLE)
entry->type = E820_RAM;
}
}
+static unsigned long __init xen_count_remap_pages(unsigned long max_pfn)
+{
+ unsigned long extra = 0;
+ unsigned long start_pfn, end_pfn;
+ const struct e820entry *entry = xen_e820_map;
+ int i;
+
+ end_pfn = 0;
+ for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+ start_pfn = PFN_DOWN(entry->addr);
+ /* Adjacent regions on non-page boundaries handling! */
+ end_pfn = min(end_pfn, start_pfn);
+
+ if (start_pfn >= max_pfn)
+ return extra + max_pfn - end_pfn;
+
+ /* Add any holes in map to result. */
+ extra += start_pfn - end_pfn;
+
+ end_pfn = PFN_UP(entry->addr + entry->size);
+ end_pfn = min(end_pfn, max_pfn);
+
+ if (entry->type != E820_RAM)
+ extra += end_pfn - start_pfn;
+ }
+
+ return extra;
+}
+
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+{
+ struct e820entry *entry;
+ unsigned mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return false;
+
+ end = start + size;
+ entry = xen_e820_map;
+
+ for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
+ if (entry->type == E820_RAM && entry->addr <= start &&
+ (entry->addr + entry->size) >= end)
+ return false;
+
+ entry++;
+ }
+
+ return true;
+}
+
+/*
+ * Find a free area in physical memory not yet reserved and compliant with
+ * E820 map.
+ * Used to relocate pre-allocated areas like initrd or p2m list which are in
+ * conflict with the to be used E820 map.
+ * In case no area is found, return 0. Otherwise return the physical address
+ * of the area which is already reserved for convenience.
+ */
+phys_addr_t __init xen_find_free_area(phys_addr_t size)
+{
+ unsigned mapcnt;
+ phys_addr_t addr, start;
+ struct e820entry *entry = xen_e820_map;
+
+ for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
+ if (entry->type != E820_RAM || entry->size < size)
+ continue;
+ start = entry->addr;
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ if (!memblock_is_reserved(addr))
+ continue;
+ start = addr + PAGE_SIZE;
+ if (start + size > entry->addr + entry->size)
+ break;
+ }
+ if (addr >= start + size) {
+ memblock_reserve(start, size);
+ return start;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Like memcpy, but with physical addresses for dest and src.
+ */
+static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
+ phys_addr_t n)
+{
+ phys_addr_t dest_off, src_off, dest_len, src_len, len;
+ void *from, *to;
+
+ while (n) {
+ dest_off = dest & ~PAGE_MASK;
+ src_off = src & ~PAGE_MASK;
+ dest_len = n;
+ if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off)
+ dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off;
+ src_len = n;
+ if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off)
+ src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off;
+ len = min(dest_len, src_len);
+ to = early_memremap(dest - dest_off, dest_len + dest_off);
+ from = early_memremap(src - src_off, src_len + src_off);
+ memcpy(to, from, len);
+ early_memunmap(to, dest_len + dest_off);
+ early_memunmap(from, src_len + src_off);
+ n -= len;
+ dest += len;
+ src += len;
+ }
+}
+
+/*
+ * Reserve Xen mfn_list.
+ */
+static void __init xen_reserve_xen_mfnlist(void)
+{
+ phys_addr_t start, size;
+
+ if (xen_start_info->mfn_list >= __START_KERNEL_map) {
+ start = __pa(xen_start_info->mfn_list);
+ size = PFN_ALIGN(xen_start_info->nr_pages *
+ sizeof(unsigned long));
+ } else {
+ start = PFN_PHYS(xen_start_info->first_p2m_pfn);
+ size = PFN_PHYS(xen_start_info->nr_p2m_frames);
+ }
+
+ if (!xen_is_e820_reserved(start, size)) {
+ memblock_reserve(start, size);
+ return;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * Relocating the p2m on 32 bit system to an arbitrary virtual address
+ * is not supported, so just give up.
+ */
+ xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n");
+ BUG();
+#else
+ xen_relocate_p2m();
+#endif
+}
+
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
- static struct e820entry map[E820MAX] __initdata;
-
- unsigned long max_pfn = xen_start_info->nr_pages;
- phys_addr_t mem_end;
+ unsigned long max_pfn, pfn_s, n_pfns;
+ phys_addr_t mem_end, addr, size, chunk_size;
+ u32 type;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
- unsigned long remapped_pages;
int i;
int op;
- max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
+ xen_parse_512gb();
+ max_pfn = xen_get_pages_limit();
+ max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
+ set_xen_guest_handle(memmap.buffer, xen_e820_map);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
@@ -590,15 +775,16 @@ char * __init xen_memory_setup(void)
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
- map[0].addr = 0ULL;
- map[0].size = mem_end;
+ xen_e820_map[0].addr = 0ULL;
+ xen_e820_map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
- map[0].size += 8ULL << 20;
- map[0].type = E820_RAM;
+ xen_e820_map[0].size += 8ULL << 20;
+ xen_e820_map[0].type = E820_RAM;
rc = 0;
}
BUG_ON(rc);
BUG_ON(memmap.nr_entries == 0);
+ xen_e820_map_entries = memmap.nr_entries;
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -609,24 +795,19 @@ char * __init xen_memory_setup(void)
* a patch in the future.
*/
if (xen_initial_domain())
- xen_ignore_unusable(map, memmap.nr_entries);
+ xen_ignore_unusable();
/* Make sure the Xen-supplied memory map is well-ordered. */
- sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
+ sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+ &xen_e820_map_entries);
max_pages = xen_get_max_pages();
- if (max_pages > max_pfn)
- extra_pages += max_pages - max_pfn;
- /*
- * Set identity map on non-RAM pages and prepare remapping the
- * underlying RAM.
- */
- xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
- &xen_released_pages, &remapped_pages);
+ /* How many extra pages do we need due to remapping? */
+ max_pages += xen_count_remap_pages(max_pfn);
- extra_pages += xen_released_pages;
- extra_pages += remapped_pages;
+ if (max_pages > max_pfn)
+ extra_pages += max_pages - max_pfn;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -635,46 +816,54 @@ char * __init xen_memory_setup(void)
* is limited to the max size of lowmem, so that it doesn't
* get completely filled.
*
+ * Make sure we have no memory above max_pages, as this area
+ * isn't handled by the p2m management.
+ *
* In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here.
*/
- extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
- extra_pages);
+ extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
+ extra_pages, max_pages - max_pfn);
i = 0;
- while (i < memmap.nr_entries) {
- phys_addr_t addr = map[i].addr;
- phys_addr_t size = map[i].size;
- u32 type = map[i].type;
+ addr = xen_e820_map[0].addr;
+ size = xen_e820_map[0].size;
+ while (i < xen_e820_map_entries) {
+ chunk_size = size;
+ type = xen_e820_map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
- size = min(size, mem_end - addr);
+ chunk_size = min(size, mem_end - addr);
} else if (extra_pages) {
- size = min(size, PFN_PHYS(extra_pages));
- extra_pages -= PFN_DOWN(size);
- xen_add_extra_mem(addr, size);
- xen_max_p2m_pfn = PFN_DOWN(addr + size);
+ chunk_size = min(size, PFN_PHYS(extra_pages));
+ pfn_s = PFN_UP(addr);
+ n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s;
+ extra_pages -= n_pfns;
+ xen_add_extra_mem(pfn_s, n_pfns);
+ xen_max_p2m_pfn = pfn_s + n_pfns;
} else
type = E820_UNUSABLE;
}
- xen_align_and_add_e820_region(addr, size, type);
+ xen_align_and_add_e820_region(addr, chunk_size, type);
- map[i].addr += size;
- map[i].size -= size;
- if (map[i].size == 0)
+ addr += chunk_size;
+ size -= chunk_size;
+ if (size == 0) {
i++;
+ if (i < xen_e820_map_entries) {
+ addr = xen_e820_map[i].addr;
+ size = xen_e820_map[i].size;
+ }
+ }
}
/*
* Set the rest as identity mapped, in case PCI BARs are
* located here.
- *
- * PFNs above MAX_P2M_PFN are considered identity mapped as
- * well.
*/
- set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
+ set_phys_range_identity(addr / PAGE_SIZE, ~0ul);
/*
* In domU, the ISA region is normal, usable memory, but we
@@ -684,34 +873,53 @@ char * __init xen_memory_setup(void)
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
+ sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
/*
- * Reserve Xen bits:
- * - mfn_list
- * - xen_start_info
- * See comment above "struct start_info" in <xen/interface/xen.h>
- * We tried to make the the memblock_reserve more selective so
- * that it would be clear what region is reserved. Sadly we ran
- * in the problem wherein on a 64-bit hypervisor with a 32-bit
- * initial domain, the pt_base has the cr3 value which is not
- * neccessarily where the pagetable starts! As Jan put it: "
- * Actually, the adjustment turns out to be correct: The page
- * tables for a 32-on-64 dom0 get allocated in the order "first L1",
- * "first L2", "first L3", so the offset to the page table base is
- * indeed 2. When reading xen/include/public/xen.h's comment
- * very strictly, this is not a violation (since there nothing is said
- * that the first thing in the page table space is pointed to by
- * pt_base; I admit that this seems to be implied though, namely
- * do I think that it is implied that the page table space is the
- * range [pt_base, pt_base + nt_pt_frames), whereas that
- * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
- * which - without a priori knowledge - the kernel would have
- * difficulty to figure out)." - so lets just fall back to the
- * easy way and reserve the whole region.
+ * Check whether the kernel itself conflicts with the target E820 map.
+ * Failing now is better than running into weird problems later due
+ * to relocating (and even reusing) pages with kernel text or data.
*/
- memblock_reserve(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list);
+ if (xen_is_e820_reserved(__pa_symbol(_text),
+ __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
+ xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
+ BUG();
+ }
- sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+ /*
+ * Check for a conflict of the hypervisor supplied page tables with
+ * the target E820 map.
+ */
+ xen_pt_check_e820();
+
+ xen_reserve_xen_mfnlist();
+
+ /* Check for a conflict of the initrd with the target E820 map. */
+ if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image,
+ boot_params.hdr.ramdisk_size)) {
+ phys_addr_t new_area, start, size;
+
+ new_area = xen_find_free_area(boot_params.hdr.ramdisk_size);
+ if (!new_area) {
+ xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n");
+ BUG();
+ }
+
+ start = boot_params.hdr.ramdisk_image;
+ size = boot_params.hdr.ramdisk_size;
+ xen_phys_memcpy(new_area, start, size);
+ pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n",
+ start, start + size, new_area, new_area + size);
+ memblock_free(start, size);
+ boot_params.hdr.ramdisk_image = new_area;
+ boot_params.ext_ramdisk_image = new_area >> 32;
+ }
+
+ /*
+ * Set identity map on non-RAM pages and prepare remapping the
+ * underlying RAM.
+ */
+ xen_set_identity_and_remap(max_pfn);
return "Xen";
}
@@ -721,26 +929,30 @@ char * __init xen_memory_setup(void)
*/
char * __init xen_auto_xlated_memory_setup(void)
{
- static struct e820entry map[E820MAX] __initdata;
-
struct xen_memory_map memmap;
int i;
int rc;
memmap.nr_entries = E820MAX;
- set_xen_guest_handle(memmap.buffer, map);
+ set_xen_guest_handle(memmap.buffer, xen_e820_map);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
if (rc < 0)
panic("No memory map (%d)\n", rc);
- sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries);
+ xen_e820_map_entries = memmap.nr_entries;
+
+ sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
+ &xen_e820_map_entries);
- for (i = 0; i < memmap.nr_entries; i++)
- e820_add_region(map[i].addr, map[i].size, map[i].type);
+ for (i = 0; i < xen_e820_map_entries; i++)
+ e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
+ xen_e820_map[i].type);
- memblock_reserve(__pa(xen_start_info->mfn_list),
- xen_start_info->pt_base - xen_start_info->mfn_list);
+ /* Remove p2m info, it is not needed. */
+ xen_start_info->mfn_list = 0;
+ xen_start_info->first_p2m_pfn = 0;
+ xen_start_info->nr_p2m_frames = 0;
return "Xen";
}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 86484384492e..3f4ebf0261f2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -26,6 +26,7 @@
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
@@ -38,6 +39,7 @@
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
+#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu)
kfree(per_cpu(xen_irq_work, cpu).name);
per_cpu(xen_irq_work, cpu).name = NULL;
}
+
+ if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+ unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+ per_cpu(xen_pmu_irq, cpu).irq = -1;
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
+ }
};
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- char *resched_name, *callfunc_name, *debug_name;
+ char *resched_name, *callfunc_name, *debug_name, *pmu_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu)
per_cpu(xen_irq_work, cpu).irq = rc;
per_cpu(xen_irq_work, cpu).name = callfunc_name;
+ if (is_xen_pmu(cpu)) {
+ pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+ xen_pmu_irq_handler,
+ IRQF_PERCPU|IRQF_NOBALANCING,
+ pmu_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(xen_pmu_irq, cpu).irq = rc;
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+ }
+
return 0;
fail:
@@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
+ xen_pmu_init(0);
+
if (xen_smp_intr_init(0))
BUG();
@@ -429,7 +453,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
}
#endif
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
- ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
+ ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
@@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
if (rc)
return rc;
+ xen_pmu_init(cpu);
+
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
@@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu)
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
+ xen_pmu_finish(cpu);
}
}
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 53b4c0811f4f..feddabdab448 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -11,6 +11,7 @@
#include "xen-ops.h"
#include "mmu.h"
+#include "pmu.h"
static void xen_pv_pre_suspend(void)
{
@@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled)
void xen_arch_pre_suspend(void)
{
- if (xen_pv_domain())
- xen_pv_pre_suspend();
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ xen_pmu_finish(cpu);
+
+ if (xen_pv_domain())
+ xen_pv_pre_suspend();
}
void xen_arch_post_suspend(int cancelled)
{
- if (xen_pv_domain())
- xen_pv_post_suspend(cancelled);
- else
- xen_hvm_post_suspend(cancelled);
+ int cpu;
+
+ if (xen_pv_domain())
+ xen_pv_post_suspend(cancelled);
+ else
+ xen_hvm_post_suspend(cancelled);
+
+ for_each_online_cpu(cpu)
+ xen_pmu_init(cpu);
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 55da33b1d51c..f1ba6a092854 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -274,30 +274,18 @@ static s64 get_abs_timeout(unsigned long delta)
return xen_clocksource_read() + delta;
}
-static void xen_timerop_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int xen_timerop_shutdown(struct clock_event_device *evt)
{
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- /* unsupported */
- WARN_ON(1);
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- case CLOCK_EVT_MODE_RESUME:
- break;
-
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- HYPERVISOR_set_timer_op(0); /* cancel timeout */
- break;
- }
+ /* cancel timeout */
+ HYPERVISOR_set_timer_op(0);
+
+ return 0;
}
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
- WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+ WARN_ON(!clockevent_state_oneshot(evt));
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
@@ -310,46 +298,39 @@ static int xen_timerop_set_next_event(unsigned long delta,
}
static const struct clock_event_device xen_timerop_clockevent = {
- .name = "xen",
- .features = CLOCK_EVT_FEAT_ONESHOT,
+ .name = "xen",
+ .features = CLOCK_EVT_FEAT_ONESHOT,
- .max_delta_ns = 0xffffffff,
- .min_delta_ns = TIMER_SLOP,
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_SLOP,
- .mult = 1,
- .shift = 0,
- .rating = 500,
+ .mult = 1,
+ .shift = 0,
+ .rating = 500,
- .set_mode = xen_timerop_set_mode,
- .set_next_event = xen_timerop_set_next_event,
+ .set_state_shutdown = xen_timerop_shutdown,
+ .set_next_event = xen_timerop_set_next_event,
};
+static int xen_vcpuop_shutdown(struct clock_event_device *evt)
+{
+ int cpu = smp_processor_id();
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
+ HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
+
+ return 0;
+}
-static void xen_vcpuop_set_mode(enum clock_event_mode mode,
- struct clock_event_device *evt)
+static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
int cpu = smp_processor_id();
- switch (mode) {
- case CLOCK_EVT_MODE_PERIODIC:
- WARN_ON(1); /* unsupported */
- break;
-
- case CLOCK_EVT_MODE_ONESHOT:
- if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
- BUG();
- break;
+ if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
+ BUG();
- case CLOCK_EVT_MODE_UNUSED:
- case CLOCK_EVT_MODE_SHUTDOWN:
- if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
- HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
- BUG();
- break;
- case CLOCK_EVT_MODE_RESUME:
- break;
- }
+ return 0;
}
static int xen_vcpuop_set_next_event(unsigned long delta,
@@ -359,7 +340,7 @@ static int xen_vcpuop_set_next_event(unsigned long delta,
struct vcpu_set_singleshot_timer single;
int ret;
- WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
+ WARN_ON(!clockevent_state_oneshot(evt));
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
@@ -382,7 +363,8 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
.shift = 0,
.rating = 500,
- .set_mode = xen_vcpuop_set_mode,
+ .set_state_shutdown = xen_vcpuop_shutdown,
+ .set_state_oneshot = xen_vcpuop_set_oneshot,
.set_next_event = xen_vcpuop_set_next_event,
};
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 8afdfccf6086..b65f59a358a2 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -104,6 +104,8 @@ ENTRY(hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+ /* Map the p2m table to a 512GB-aligned user address. */
+ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c20fe29e65f4..1399423f3418 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
+void __init xen_reserve_special_pages(void);
+void __init xen_pt_check_e820(void);
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+#ifdef CONFIG_X86_64
+void __init xen_relocate_p2m(void);
+#endif
+bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
+phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
@@ -101,17 +108,15 @@ struct dom0_vga_console_info;
#ifdef CONFIG_XEN_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
-void __init xen_init_apic(void);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
size_t size)
{
}
-static inline void __init xen_init_apic(void)
-{
-}
#endif
+void __init xen_init_apic(void);
+
#ifdef CONFIG_XEN_EFI
extern void xen_efi_init(void);
#else