aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/.gitignore2
-rw-r--r--arch/x86/Kbuild2
-rw-r--r--arch/x86/Kconfig94
-rw-r--r--arch/x86/Makefile21
-rw-r--r--arch/x86/boot/Makefile7
-rw-r--r--arch/x86/boot/compressed/Makefile22
-rw-r--r--arch/x86/boot/compressed/aslr.c18
-rw-r--r--arch/x86/boot/compressed/early_serial_console.c4
-rw-r--r--arch/x86/boot/compressed/eboot.c92
-rw-r--r--arch/x86/boot/compressed/eboot.h16
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/head_64.S5
-rw-r--r--arch/x86/boot/compressed/misc.c13
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c9
-rw-r--r--arch/x86/boot/cpu.c68
-rw-r--r--arch/x86/boot/mkcpustr.c1
-rw-r--r--arch/x86/configs/tiny.config1
-rw-r--r--arch/x86/crypto/Makefile1
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S20
-rw-r--r--arch/x86/crypto/sha-mb/Makefile11
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c935
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S287
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S327
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c64
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S228
-rw-r--r--arch/x86/crypto/sha-mb/sha1_x8_avx2.S472
-rw-r--r--arch/x86/crypto/sha-mb/sha_mb_ctx.h136
-rw-r--r--arch/x86/crypto/sha-mb/sha_mb_mgr.h110
-rw-r--r--arch/x86/ia32/ia32_aout.c21
-rw-r--r--arch/x86/ia32/ia32entry.S30
-rw-r--r--arch/x86/include/asm/Kbuild4
-rw-r--r--arch/x86/include/asm/alternative.h14
-rw-r--r--arch/x86/include/asm/apic.h46
-rw-r--r--arch/x86/include/asm/atomic.h17
-rw-r--r--arch/x86/include/asm/atomic64_64.h2
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/calling.h6
-rw-r--r--arch/x86/include/asm/cpufeature.h60
-rw-r--r--arch/x86/include/asm/crash.h9
-rw-r--r--arch/x86/include/asm/debugreg.h4
-rw-r--r--arch/x86/include/asm/disabled-features.h39
-rw-r--r--arch/x86/include/asm/dma-contiguous.h12
-rw-r--r--arch/x86/include/asm/efi.h55
-rw-r--r--arch/x86/include/asm/elf.h5
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/fpu-internal.h11
-rw-r--r--arch/x86/include/asm/hardirq.h3
-rw-r--r--arch/x86/include/asm/i8259.h5
-rw-r--r--arch/x86/include/asm/io_apic.h59
-rw-r--r--arch/x86/include/asm/irq_work.h11
-rw-r--r--arch/x86/include/asm/kexec-bzimage64.h6
-rw-r--r--arch/x86/include/asm/kexec.h45
-rw-r--r--arch/x86/include/asm/kprobes.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h50
-rw-r--r--arch/x86/include/asm/kvm_para.h10
-rw-r--r--arch/x86/include/asm/microcode_intel.h2
-rw-r--r--arch/x86/include/asm/mpspec.h15
-rw-r--r--arch/x86/include/asm/numa.h1
-rw-r--r--arch/x86/include/asm/page.h1
-rw-r--r--arch/x86/include/asm/page_64.h2
-rw-r--r--arch/x86/include/asm/perf_event.h8
-rw-r--r--arch/x86/include/asm/perf_event_p4.h2
-rw-r--r--arch/x86/include/asm/pgtable.h9
-rw-r--r--arch/x86/include/asm/pgtable_32.h3
-rw-r--r--arch/x86/include/asm/pgtable_64.h4
-rw-r--r--arch/x86/include/asm/pgtable_types.h25
-rw-r--r--arch/x86/include/asm/preempt.h1
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/prom.h2
-rw-r--r--arch/x86/include/asm/ptrace.h5
-rw-r--r--arch/x86/include/asm/rwlock.h49
-rw-r--r--arch/x86/include/asm/scatterlist.h8
-rw-r--r--arch/x86/include/asm/serial.h24
-rw-r--r--arch/x86/include/asm/smp.h1
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h10
-rw-r--r--arch/x86/include/asm/spinlock.h81
-rw-r--r--arch/x86/include/asm/spinlock_types.h4
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h12
-rw-r--r--arch/x86/include/asm/xsave.h223
-rw-r--r--arch/x86/include/uapi/asm/e820.h5
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h3
-rw-r--r--arch/x86/include/uapi/asm/vmx.h2
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c404
-rw-r--r--arch/x86/kernel/apb_timer.c6
-rw-r--r--arch/x86/kernel/apic/apic.c83
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c16
-rw-r--r--arch/x86/kernel/apic/apic_noop.c23
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c10
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c14
-rw-r--r--arch/x86/kernel/apic/io_apic.c789
-rw-r--r--arch/x86/kernel/apic/probe_32.c33
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c10
-rw-r--r--arch/x86/kernel/cpu/Makefile14
-rw-r--r--arch/x86/kernel/cpu/amd.c7
-rw-r--r--arch/x86/kernel/cpu/common.c55
-rw-r--r--arch/x86/kernel/cpu/intel.c37
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c52
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c6
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/amd_early.c35
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event.h48
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c74
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c205
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c3175
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h439
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c1221
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c636
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c2258
-rw-r--r--arch/x86/kernel/cpu/perf_event_knc.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c6
-rw-r--r--arch/x86/kernel/crash.c563
-rw-r--r--arch/x86/kernel/devicetree.c207
-rw-r--r--arch/x86/kernel/e820.c7
-rw-r--r--arch/x86/kernel/entry_32.S18
-rw-r--r--arch/x86/kernel/entry_64.S51
-rw-r--r--arch/x86/kernel/hw_breakpoint.c8
-rw-r--r--arch/x86/kernel/i387.c2
-rw-r--r--arch/x86/kernel/i8259.c3
-rw-r--r--arch/x86/kernel/iosf_mbi.c93
-rw-r--r--arch/x86/kernel/irq_64.c6
-rw-r--r--arch/x86/kernel/irq_work.c2
-rw-r--r--arch/x86/kernel/irqinit.c17
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c554
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/kvm.c30
-rw-r--r--arch/x86/kernel/machine_kexec_32.c3
-rw-r--r--arch/x86/kernel/machine_kexec_64.c250
-rw-r--r--arch/x86/kernel/mpparse.c111
-rw-r--r--arch/x86/kernel/pmc_atom.c11
-rw-r--r--arch/x86/kernel/preempt.S25
-rw-r--r--arch/x86/kernel/process.c17
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c3
-rw-r--r--arch/x86/kernel/ptrace.c165
-rw-r--r--arch/x86/kernel/quirks.c18
-rw-r--r--arch/x86/kernel/setup.c11
-rw-r--r--arch/x86/kernel/signal.c5
-rw-r--r--arch/x86/kernel/smpboot.c194
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/vsmp_64.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c2
-rw-r--r--arch/x86/kernel/xsave.c125
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c31
-rw-r--r--arch/x86/kvm/cpuid.h10
-rw-r--r--arch/x86/kvm/emulate.c369
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/lapic.c86
-rw-r--r--arch/x86/kvm/mmu.c141
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/mmu_audit.c2
-rw-r--r--arch/x86/kvm/paging_tmpl.h22
-rw-r--r--arch/x86/kvm/pmu.c24
-rw-r--r--arch/x86/kvm/svm.c54
-rw-r--r--arch/x86/kvm/trace.h41
-rw-r--r--arch/x86/kvm/vmx.c435
-rw-r--r--arch/x86/kvm/x86.c190
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S32
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S20
-rw-r--r--arch/x86/lib/csum-wrappers_64.c5
-rw-r--r--arch/x86/lib/rwlock.S44
-rw-r--r--arch/x86/lib/thunk_32.S41
-rw-r--r--arch/x86/lib/thunk_64.S7
-rw-r--r--arch/x86/mm/dump_pagetables.c4
-rw-r--r--arch/x86/mm/fault.c32
-rw-r--r--arch/x86/mm/init_32.c5
-rw-r--r--arch/x86/mm/init_64.c41
-rw-r--r--arch/x86/mm/ioremap.c20
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c14
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/numa.c123
-rw-r--r--arch/x86/mm/pageattr.c2
-rw-r--r--arch/x86/mm/pgtable_32.c35
-rw-r--r--arch/x86/mm/tlb.c10
-rw-r--r--arch/x86/net/bpf_jit_comp.c154
-rw-r--r--arch/x86/oprofile/nmi_int.c8
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/pci/acpi.c6
-rw-r--r--arch/x86/pci/common.c20
-rw-r--r--arch/x86/pci/fixup.c24
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/intel_mid_pci.c27
-rw-r--r--arch/x86/pci/irq.c16
-rw-r--r--arch/x86/pci/mmconfig-shared.c40
-rw-r--r--arch/x86/pci/pcbios.c8
-rw-r--r--arch/x86/pci/xen.c7
-rw-r--r--arch/x86/platform/ce4100/ce4100.c11
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c36
-rw-r--r--arch/x86/platform/efi/efi.c52
-rw-r--r--arch/x86/platform/efi/efi_32.c12
-rw-r--r--arch/x86/platform/efi/efi_64.c6
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S4
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_wdt.c22
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_weak_decls.h7
-rw-r--r--arch/x86/platform/intel-mid/sfi.c58
-rw-r--r--arch/x86/platform/sfi/sfi.c10
-rw-r--r--arch/x86/platform/uv/tlb_uv.c2
-rw-r--r--arch/x86/platform/uv/uv_nmi.c40
-rw-r--r--arch/x86/platform/uv/uv_time.c2
-rw-r--r--arch/x86/power/hibernate_32.c4
-rw-r--r--arch/x86/power/hibernate_64.c4
-rw-r--r--arch/x86/purgatory/Makefile29
-rw-r--r--arch/x86/purgatory/entry64.S101
-rw-r--r--arch/x86/purgatory/purgatory.c72
-rw-r--r--arch/x86/purgatory/setup-x86_64.S58
-rw-r--r--arch/x86/purgatory/sha256.c283
-rw-r--r--arch/x86/purgatory/sha256.h22
-rw-r--r--arch/x86/purgatory/stack.S19
-rw-r--r--arch/x86/purgatory/string.c13
-rw-r--r--arch/x86/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/syscalls/syscall_64.tbl3
-rw-r--r--arch/x86/tools/calc_run_size.pl30
-rw-r--r--arch/x86/tools/relocs.c2
-rw-r--r--arch/x86/um/asm/elf.h1
-rw-r--r--arch/x86/um/asm/ptrace.h4
-rw-r--r--arch/x86/um/asm/syscall.h15
-rw-r--r--arch/x86/um/checksum_32.S239
-rw-r--r--arch/x86/um/mem_64.c15
-rw-r--r--arch/x86/um/signal.c45
-rw-r--r--arch/x86/vdso/vdso2c.h12
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/xen/efi.c2
-rw-r--r--arch/x86/xen/enlighten.c39
-rw-r--r--arch/x86/xen/grant-table.c70
-rw-r--r--arch/x86/xen/mmu.c80
-rw-r--r--arch/x86/xen/multicalls.c8
-rw-r--r--arch/x86/xen/p2m.c109
-rw-r--r--arch/x86/xen/p2m.h15
-rw-r--r--arch/x86/xen/setup.c371
-rw-r--r--arch/x86/xen/smp.c34
-rw-r--r--arch/x86/xen/smp.h8
-rw-r--r--arch/x86/xen/spinlock.c2
-rw-r--r--arch/x86/xen/time.c12
-rw-r--r--arch/x86/xen/xen-head.S36
252 files changed, 13517 insertions, 7047 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index 7cab8c08e6d1..aff152c87cf4 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -1,4 +1,6 @@
boot/compressed/vmlinux
tools/test_get_len
tools/insn_sanity
+purgatory/kexec-purgatory.c
+purgatory/purgatory.ro
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index e5287d8517aa..3942f74c92d7 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -16,3 +16,5 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
obj-y += net/
+
+obj-$(CONFIG_KEXEC_FILE) += purgatory/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bf2405053af5..ded8a6774ac9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,13 +23,13 @@ config X86
def_bool y
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_FAST_MULTIPLIER
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_INT128 if X86_64
- select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
@@ -96,6 +96,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select GENERIC_IOMAP
@@ -135,11 +136,16 @@ config X86
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
+ select X86_FEATURE_NAMES if PROC_FS
config INSTRUCTION_DECODER
def_bool y
depends on KPROBES || PERF_EVENTS || UPROBES
+config PERF_EVENTS_INTEL_UNCORE
+ def_bool y
+ depends on PERF_EVENTS && CPU_SUP_INTEL && PCI
+
config OUTPUT_FORMAT
string
default "elf32-i386" if X86_32
@@ -312,6 +318,17 @@ config SMP
If you don't know what to do here, say N.
+config X86_FEATURE_NAMES
+ bool "Processor feature human-readable names" if EMBEDDED
+ default y
+ ---help---
+ This option compiles in a table of x86 feature bits and corresponding
+ names. This is required to support /proc/cpuinfo and a few kernel
+ messages. You can disable this to save space, at the expense of
+ making those few kernel messages show numeric feature bits instead.
+
+ If in doubt, say Y.
+
config X86_X2APIC
bool "Support x2apic"
depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
@@ -433,6 +450,7 @@ config X86_INTEL_CE
bool "CE4100 TV platform"
depends on PCI
depends on PCI_GODIRECT
+ depends on X86_IO_APIC
depends on X86_32
depends on X86_EXTENDED_PLATFORM
select X86_REBOOTFIXUPS
@@ -477,6 +495,36 @@ config X86_INTEL_LPSS
things like clock tree (common clock framework) and pincontrol
which are needed by the LPSS peripheral drivers.
+config IOSF_MBI
+ tristate "Intel SoC IOSF Sideband support for SoC platforms"
+ depends on PCI
+ ---help---
+ This option enables sideband register access support for Intel SoC
+ platforms. On these platforms the IOSF sideband is used in lieu of
+ MSR's for some register accesses, mostly but not limited to thermal
+ and power. Drivers may query the availability of this device to
+ determine if they need the sideband in order to work on these
+ platforms. The sideband is available on the following SoC products.
+ This list is not meant to be exclusive.
+ - BayTrail
+ - Braswell
+ - Quark
+
+ You should say Y if you are running a kernel on one of these SoC's.
+
+config IOSF_MBI_DEBUG
+ bool "Enable IOSF sideband access through debugfs"
+ depends on IOSF_MBI && DEBUG_FS
+ ---help---
+ Select this option to expose the IOSF sideband access registers (MCR,
+ MDR, MCRX) through debugfs to write and read register information from
+ different units on the SoC. This is most useful for obtaining device
+ state information for debug and analysis. As this is a general access
+ mechanism, users of this option would have specific knowledge of the
+ device they want to access.
+
+ If you don't require the option or are in doubt, say N.
+
config X86_RDC321X
bool "RDC R-321x SoC"
depends on X86_32
@@ -839,6 +887,7 @@ config X86_IO_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
bool "Reroute for broken boot IRQs"
@@ -1540,7 +1589,8 @@ config EFI
config EFI_STUB
bool "EFI stub support"
- depends on EFI
+ depends on EFI && !X86_USE_3DNOW
+ select RELOCATABLE
---help---
This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
@@ -1595,6 +1645,41 @@ config KEXEC
interface is strongly in flux, so no good recommendation can be
made.
+config KEXEC_FILE
+ bool "kexec file based system call"
+ select BUILD_BIN2C
+ depends on KEXEC
+ depends on X86_64
+ depends on CRYPTO=y
+ depends on CRYPTO_SHA256=y
+ ---help---
+ This is a new version of the kexec system call. This system call
+ is file based and takes file descriptors as system call arguments
+ for the kernel and initramfs, as opposed to the list of segments
+ accepted by the previous system call.
+
+config KEXEC_VERIFY_SIG
+ bool "Verify kernel signature during kexec_file_load() syscall"
+ depends on KEXEC_FILE
+ ---help---
+ This option makes kernel signature verification mandatory for
+ kexec_file_load() syscall. If the kernel signature cannot be
+ verified, kexec_file_load() will fail.
+
+ This option enforces signature verification at generic level.
+ One needs to enable signature verification for type of kernel
+ image being loaded to make sure it works. For example, enable
+ bzImage signature verification option to be able to load and
+ verify signatures of bzImage. Otherwise kernel loading will fail.
+
+config KEXEC_BZIMAGE_VERIFY_SIG
+ bool "Enable bzImage signature verification support"
+ depends on KEXEC_VERIFY_SIG
+ depends on SIGNED_PE_FILE_VERIFICATION
+ select SYSTEM_TRUSTED_KEYRING
+ ---help---
+ Enable bzImage signature verification support.
+
config CRASH_DUMP
bool "kernel crash dumps"
depends on X86_64 || (X86_32 && HIGHMEM)
@@ -2403,11 +2488,6 @@ config X86_DMA_REMAP
bool
depends on STA2X11
-config IOSF_MBI
- tristate
- default m
- depends on PCI
-
config PMC_ATOM
def_bool y
depends on PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index c65fd9650467..920e6160c535 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -50,9 +50,6 @@ ifeq ($(CONFIG_X86_32),y)
KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return
- # Don't autogenerate MMX or SSE instructions
- KBUILD_CFLAGS += -mno-mmx -mno-sse
-
# Never want PIC in a 32-bit kernel, prevent breakage with GCC built
# with nonstandard options
KBUILD_CFLAGS += -fno-pic
@@ -80,8 +77,7 @@ else
KBUILD_AFLAGS += -m64
KBUILD_CFLAGS += -m64
- # Don't autogenerate traditional x87, MMX or SSE instructions
- KBUILD_CFLAGS += -mno-mmx -mno-sse
+ # Don't autogenerate traditional x87 instructions
KBUILD_CFLAGS += $(call cc-option,-mno-80387)
KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
@@ -168,7 +164,7 @@ KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
KBUILD_CFLAGS += $(mflags-y)
@@ -183,6 +179,11 @@ archscripts: scripts_basic
archheaders:
$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+archprepare:
+ifeq ($(CONFIG_KEXEC_FILE),y)
+ $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
+endif
+
###
# Kernel objects
@@ -246,12 +247,7 @@ archclean:
$(Q)rm -rf $(objtree)/arch/x86_64
$(Q)$(MAKE) $(clean)=$(boot)
$(Q)$(MAKE) $(clean)=arch/x86/tools
-
-PHONY += kvmconfig
-kvmconfig:
- $(if $(wildcard $(objtree)/.config),, $(error You need an existing .config for this target))
- $(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh -m -O $(objtree) $(objtree)/.config $(srctree)/arch/x86/configs/kvm_guest.config
- $(Q)yes "" | $(MAKE) -f $(srctree)/Makefile oldconfig
+ $(Q)$(MAKE) $(clean)=arch/x86/purgatory
define archhelp
echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
@@ -266,5 +262,4 @@ define archhelp
echo ' bzdisk/fdimage*/isoimage also accept:'
echo ' FDARGS="..." arguments for the booted kernel'
echo ' FDINITRD=file initrd for the booted kernel'
- echo ' kvmconfig - Enable additional options for guest kernel support'
endef
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index dbe8dd2fe247..5b016e2498f3 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -35,19 +35,22 @@ setup-y += video-vesa.o
setup-y += video-bios.o
targets += $(setup-y)
-hostprogs-y := mkcpustr tools/build
+hostprogs-y := tools/build
+hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr
HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
-include include/generated/autoconf.h \
-D__EXPORTED_HEADERS__
+ifdef CONFIG_X86_FEATURE_NAMES
$(obj)/cpu.o: $(obj)/cpustr.h
quiet_cmd_cpustr = CPUSTR $@
cmd_cpustr = $(obj)/mkcpustr > $@
-targets += cpustr.h
+targets += cpustr.h
$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
$(call if_changed,cpustr)
+endif
# ---------------------------------------------------------------------------
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 7a801a310e37..be1e07d4b596 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -26,18 +26,18 @@ LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
- $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
- $(obj)/piggy.o $(obj)/cpuflags.o $(obj)/aslr.o
+vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+ $(obj)/string.o $(obj)/cmdline.o \
+ $(obj)/piggy.o $(obj)/cpuflags.o
+
+vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
+vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/aslr.o
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
-ifeq ($(CONFIG_EFI_STUB), y)
- VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
- $(objtree)/drivers/firmware/efi/libstub/lib.a
-endif
+vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
-$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
+$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
$(call if_changed,ld)
@:
@@ -45,7 +45,7 @@ OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
-targets += $(patsubst $(obj)/%,%,$(VMLINUX_OBJS)) vmlinux.bin.all vmlinux.relocs
+targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
CMD_RELOCS = arch/x86/tools/relocs
quiet_cmd_relocs = RELOCS $@
@@ -76,8 +76,10 @@ suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_LZ4) := lz4
+RUN_SIZE = $(shell objdump -h vmlinux | \
+ perl $(srctree)/arch/x86/tools/calc_run_size.pl)
quiet_cmd_mkpiggy = MKPIGGY $@
- cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+ cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
targets += piggy.S
$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index fc6091abedb7..bb1376381985 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -1,6 +1,5 @@
#include "misc.h"
-#ifdef CONFIG_RANDOMIZE_BASE
#include <asm/msr.h>
#include <asm/archrandom.h>
#include <asm/e820.h>
@@ -183,12 +182,27 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
static bool mem_avoid_overlap(struct mem_vector *img)
{
int i;
+ struct setup_data *ptr;
for (i = 0; i < MEM_AVOID_MAX; i++) {
if (mem_overlaps(img, &mem_avoid[i]))
return true;
}
+ /* Avoid all entries in the setup_data linked list. */
+ ptr = (struct setup_data *)(unsigned long)real_mode->hdr.setup_data;
+ while (ptr) {
+ struct mem_vector avoid;
+
+ avoid.start = (unsigned long)ptr;
+ avoid.size = sizeof(*ptr) + ptr->len;
+
+ if (mem_overlaps(img, &avoid))
+ return true;
+
+ ptr = (struct setup_data *)(unsigned long)ptr->next;
+ }
+
return false;
}
@@ -320,5 +334,3 @@ unsigned char *choose_kernel_location(unsigned char *input,
out:
return (unsigned char *)choice;
}
-
-#endif /* CONFIG_RANDOMIZE_BASE */
diff --git a/arch/x86/boot/compressed/early_serial_console.c b/arch/x86/boot/compressed/early_serial_console.c
index d3d003cb5481..261e81fb9582 100644
--- a/arch/x86/boot/compressed/early_serial_console.c
+++ b/arch/x86/boot/compressed/early_serial_console.c
@@ -1,9 +1,5 @@
#include "misc.h"
-#ifdef CONFIG_EARLY_PRINTK
-
int early_serial_base;
#include "../early_serial_console.c"
-
-#endif
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index f277184e2ac1..1acf605a646d 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -19,7 +19,10 @@
static efi_system_table_t *sys_table;
-struct efi_config *efi_early;
+static struct efi_config *efi_early;
+
+#define efi_call_early(f, ...) \
+ efi_early->call(efi_early->f, __VA_ARGS__);
#define BOOT_SERVICES(bits) \
static void setup_boot_services##bits(struct efi_config *c) \
@@ -265,21 +268,25 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
offset = offsetof(typeof(*out), output_string);
output_string = efi_early->text_output + offset;
+ out = (typeof(out))(unsigned long)efi_early->text_output;
func = (u64 *)output_string;
- efi_early->call(*func, efi_early->text_output, str);
+ efi_early->call(*func, out, str);
} else {
struct efi_simple_text_output_protocol_32 *out;
u32 *func;
offset = offsetof(typeof(*out), output_string);
output_string = efi_early->text_output + offset;
+ out = (typeof(out))(unsigned long)efi_early->text_output;
func = (u32 *)output_string;
- efi_early->call(*func, efi_early->text_output, str);
+ efi_early->call(*func, out, str);
}
}
+#include "../../../../drivers/firmware/efi/libstub/efi-stub-helper.c"
+
static void find_bits(unsigned long mask, u8 *pos, u8 *size)
{
u8 first, len;
@@ -323,8 +330,10 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
size = pci->romsize + sizeof(*rom);
status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to alloc mem for rom\n");
return status;
+ }
memset(rom, 0, sizeof(*rom));
@@ -337,14 +346,18 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom)
status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
PCI_VENDOR_ID, 1, &(rom->vendor));
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->vendor\n");
goto free_struct;
+ }
status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
PCI_DEVICE_ID, 1, &(rom->devid));
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->devid\n");
goto free_struct;
+ }
status = efi_early->call(pci->get_location, pci, &(rom->segment),
&(rom->bus), &(rom->device), &(rom->function));
@@ -360,7 +373,7 @@ free_struct:
return status;
}
-static efi_status_t
+static void
setup_efi_pci32(struct boot_params *params, void **pci_handle,
unsigned long size)
{
@@ -403,8 +416,6 @@ setup_efi_pci32(struct boot_params *params, void **pci_handle,
data = (struct setup_data *)rom;
}
-
- return status;
}
static efi_status_t
@@ -427,8 +438,10 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
size = pci->romsize + sizeof(*rom);
status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to alloc mem for rom\n");
return status;
+ }
rom->data.type = SETUP_PCI;
rom->data.len = size - sizeof(struct setup_data);
@@ -439,14 +452,18 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom)
status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
PCI_VENDOR_ID, 1, &(rom->vendor));
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->vendor\n");
goto free_struct;
+ }
status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16,
PCI_DEVICE_ID, 1, &(rom->devid));
- if (status != EFI_SUCCESS)
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to read rom->devid\n");
goto free_struct;
+ }
status = efi_early->call(pci->get_location, pci, &(rom->segment),
&(rom->bus), &(rom->device), &(rom->function));
@@ -463,7 +480,7 @@ free_struct:
}
-static efi_status_t
+static void
setup_efi_pci64(struct boot_params *params, void **pci_handle,
unsigned long size)
{
@@ -506,11 +523,18 @@ setup_efi_pci64(struct boot_params *params, void **pci_handle,
data = (struct setup_data *)rom;
}
-
- return status;
}
-static efi_status_t setup_efi_pci(struct boot_params *params)
+/*
+ * There's no way to return an informative status from this function,
+ * because any analysis (and printing of error messages) needs to be
+ * done directly at the EFI function call-site.
+ *
+ * For example, EFI_INVALID_PARAMETER could indicate a bug or maybe we
+ * just didn't find any PCI devices, but there's no way to tell outside
+ * the context of the call.
+ */
+static void setup_efi_pci(struct boot_params *params)
{
efi_status_t status;
void **pci_handle = NULL;
@@ -526,8 +550,10 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
EFI_LOADER_DATA,
size, (void **)&pci_handle);
- if (status != EFI_SUCCESS)
- return status;
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table, "Failed to alloc mem for pci_handle\n");
+ return;
+ }
status = efi_call_early(locate_handle,
EFI_LOCATE_BY_PROTOCOL, &pci_proto,
@@ -538,13 +564,12 @@ static efi_status_t setup_efi_pci(struct boot_params *params)
goto free_handle;
if (efi_early->is64)
- status = setup_efi_pci64(params, pci_handle, size);
+ setup_efi_pci64(params, pci_handle, size);
else
- status = setup_efi_pci32(params, pci_handle, size);
+ setup_efi_pci32(params, pci_handle, size);
free_handle:
efi_call_early(free_pool, pci_handle);
- return status;
}
static void
@@ -1032,7 +1057,6 @@ struct boot_params *make_boot_params(struct efi_config *c)
int i;
unsigned long ramdisk_addr;
unsigned long ramdisk_size;
- unsigned long initrd_addr_max;
efi_early = c;
sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
@@ -1095,15 +1119,24 @@ struct boot_params *make_boot_params(struct efi_config *c)
memset(sdt, 0, sizeof(*sdt));
- if (hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)
- initrd_addr_max = -1UL;
- else
- initrd_addr_max = hdr->initrd_addr_max;
+ status = efi_parse_options(cmdline_ptr);
+ if (status != EFI_SUCCESS)
+ goto fail2;
status = handle_cmdline_files(sys_table, image,
(char *)(unsigned long)hdr->cmd_line_ptr,
- "initrd=", initrd_addr_max,
+ "initrd=", hdr->initrd_addr_max,
&ramdisk_addr, &ramdisk_size);
+
+ if (status != EFI_SUCCESS &&
+ hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) {
+ efi_printk(sys_table, "Trying to load files to higher address\n");
+ status = handle_cmdline_files(sys_table, image,
+ (char *)(unsigned long)hdr->cmd_line_ptr,
+ "initrd=", -1UL,
+ &ramdisk_addr, &ramdisk_size);
+ }
+
if (status != EFI_SUCCESS)
goto fail2;
hdr->ramdisk_image = ramdisk_addr & 0xffffffff;
@@ -1376,10 +1409,7 @@ struct boot_params *efi_main(struct efi_config *c,
setup_graphics(boot_params);
- status = setup_efi_pci(boot_params);
- if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "setup_efi_pci() failed!\n");
- }
+ setup_efi_pci(boot_params);
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
sizeof(*gdt), (void **)&gdt);
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index d487e727f1ec..c88c31ecad12 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -103,4 +103,20 @@ struct efi_uga_draw_protocol {
void *blt;
};
+struct efi_config {
+ u64 image_handle;
+ u64 table;
+ u64 allocate_pool;
+ u64 allocate_pages;
+ u64 get_memory_map;
+ u64 free_pool;
+ u64 free_pages;
+ u64 locate_handle;
+ u64 handle_protocol;
+ u64 exit_boot_services;
+ u64 text_output;
+ efi_status_t (*call)(unsigned long, ...);
+ bool is64;
+} __packed;
+
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index cbed1407a5cd..1d7fbbcc196d 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -207,7 +207,8 @@ relocated:
* Do the decompression, and jump to the new kernel..
*/
/* push arguments for decompress_kernel: */
- pushl $z_output_len /* decompressed length */
+ pushl $z_run_size /* size of kernel with .bss and .brk */
+ pushl $z_output_len /* decompressed length, end of relocs */
leal z_extract_offset_negative(%ebx), %ebp
pushl %ebp /* output address */
pushl $z_input_len /* input_len */
@@ -217,7 +218,7 @@ relocated:
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call decompress_kernel /* returns kernel location in %eax */
- addl $24, %esp
+ addl $28, %esp
/*
* Jump to the decompressed kernel.
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2884e0c3e8a5..6b1766c6c082 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -402,13 +402,16 @@ relocated:
* Do the decompression, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
+ movq $z_run_size, %r9 /* size of kernel with .bss and .brk */
+ pushq %r9
movq %rsi, %rdi /* real mode address */
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
leaq input_data(%rip), %rdx /* input_data */
movl $z_input_len, %ecx /* input_len */
movq %rbp, %r8 /* output target address */
- movq $z_output_len, %r9 /* decompressed length */
+ movq $z_output_len, %r9 /* decompressed length, end of relocs */
call decompress_kernel /* returns kernel location in %rax */
+ popq %r9
popq %rsi
/*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 57ab74df7eea..30dd59a9f0b4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -358,7 +358,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
- unsigned long output_len)
+ unsigned long output_len,
+ unsigned long run_size)
{
real_mode = rmode;
@@ -381,8 +382,14 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
- output = choose_kernel_location(input_data, input_len,
- output, output_len);
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ */
+ output = choose_kernel_location(input_data, input_len, output,
+ output_len > run_size ? output_len
+ : run_size);
/* Validate memory location choices. */
if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index b669ab65bf6c..d8222f213182 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -36,11 +36,13 @@ int main(int argc, char *argv[])
uint32_t olen;
long ilen;
unsigned long offs;
+ unsigned long run_size;
FILE *f = NULL;
int retval = 1;
- if (argc < 2) {
- fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+ if (argc < 3) {
+ fprintf(stderr, "Usage: %s compressed_file run_size\n",
+ argv[0]);
goto bail;
}
@@ -74,6 +76,7 @@ int main(int argc, char *argv[])
offs += olen >> 12; /* Add 8 bytes for each 32K block */
offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+ run_size = atoi(argv[2]);
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
printf(".globl z_input_len\n");
@@ -85,6 +88,8 @@ int main(int argc, char *argv[])
/* z_extract_offset_negative allows simplification of head_32.S */
printf(".globl z_extract_offset_negative\n");
printf("z_extract_offset_negative = -0x%lx\n", offs);
+ printf(".globl z_run_size\n");
+ printf("z_run_size = %lu\n", run_size);
printf(".globl input_data, input_data_end\n");
printf("input_data:\n");
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 6ec6bb6e9957..29207f69ae8c 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,7 +16,9 @@
*/
#include "boot.h"
+#ifdef CONFIG_X86_FEATURE_NAMES
#include "cpustr.h"
+#endif
static char *cpu_name(int level)
{
@@ -32,11 +34,48 @@ static char *cpu_name(int level)
}
}
+/*
+ * Print every feature bit that is set in err_flags, which is read as
+ * NCAPINTS 32-bit words.
+ *
+ * With CONFIG_X86_FEATURE_NAMES the bit is printed by name when the
+ * x86_cap_strs table has a non-empty string for it, otherwise (and always
+ * without that option) it is printed as "word:bit".
+ *
+ * The table is read as a sequence of records: one byte word index, one
+ * byte bit index, then a NUL-terminated name.
+ * NOTE(review): at most one record is skipped per bit position, so the
+ * table is presumably sorted by (word, bit) with no duplicates -- confirm
+ * against the generator.
+ */
+static void show_cap_strs(u32 *err_flags)
+{
+ int i, j;
+#ifdef CONFIG_X86_FEATURE_NAMES
+ const unsigned char *msg_strs = (const unsigned char *)x86_cap_strs;
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+ for (j = 0; j < 32; j++) {
+ /* Current record describes an earlier bit: step past it */
+ if (msg_strs[0] < i ||
+ (msg_strs[0] == i && msg_strs[1] < j)) {
+ /* Skip to the next string */
+ msg_strs += 2;
+ while (*msg_strs++)
+ ;
+ }
+ if (e & 1) {
+ /* Name only if the record matches (i, j) and is non-empty */
+ if (msg_strs[0] == i &&
+ msg_strs[1] == j &&
+ msg_strs[2])
+ printf("%s ", msg_strs+2);
+ else
+ printf("%d:%d ", i, j);
+ }
+ e >>= 1;
+ }
+ }
+#else
+ /* No name table built in: report raw word:bit positions only */
+ for (i = 0; i < NCAPINTS; i++) {
+ u32 e = err_flags[i];
+ for (j = 0; j < 32; j++) {
+ if (e & 1)
+ printf("%d:%d ", i, j);
+ e >>= 1;
+ }
+ }
+#endif
+}
+
int validate_cpu(void)
{
u32 *err_flags;
int cpu_level, req_level;
- const unsigned char *msg_strs;
check_cpu(&cpu_level, &req_level, &err_flags);
@@ -49,34 +88,9 @@ int validate_cpu(void)
}
if (err_flags) {
- int i, j;
puts("This kernel requires the following features "
"not present on the CPU:\n");
-
- msg_strs = (const unsigned char *)x86_cap_strs;
-
- for (i = 0; i < NCAPINTS; i++) {
- u32 e = err_flags[i];
-
- for (j = 0; j < 32; j++) {
- if (msg_strs[0] < i ||
- (msg_strs[0] == i && msg_strs[1] < j)) {
- /* Skip to the next string */
- msg_strs += 2;
- while (*msg_strs++)
- ;
- }
- if (e & 1) {
- if (msg_strs[0] == i &&
- msg_strs[1] == j &&
- msg_strs[2])
- printf("%s ", msg_strs+2);
- else
- printf("%d:%d ", i, j);
- }
- e >>= 1;
- }
- }
+ show_cap_strs(err_flags);
putchar('\n');
return -1;
} else {
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 4579eff0ef4d..637097e66a62 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -16,6 +16,7 @@
#include <stdio.h>
#include "../include/asm/required-features.h"
+#include "../include/asm/disabled-features.h"
#include "../include/asm/cpufeature.h"
#include "../kernel/cpu/capflags.c"
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
new file mode 100644
index 000000000000..4e2ecfa23c15
--- /dev/null
+++ b/arch/x86/configs/tiny.config
@@ -0,0 +1 @@
+CONFIG_NOHIGHMEM=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index d551165a3159..fd0f848938cc 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index f091f122ed24..2df2a0298f5a 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -79,9 +79,6 @@
#define xcounter %xmm8
#define xbyteswap %xmm9
#define xkey0 %xmm10
-#define xkey3 %xmm11
-#define xkey6 %xmm12
-#define xkey9 %xmm13
#define xkey4 %xmm11
#define xkey8 %xmm12
#define xkey12 %xmm13
@@ -108,6 +105,10 @@
byteswap_const:
.octa 0x000102030405060708090A0B0C0D0E0F
+ddq_low_msk:
+ .octa 0x0000000000000000FFFFFFFFFFFFFFFF
+ddq_high_add_1:
+ .octa 0x00000000000000010000000000000000
ddq_add_1:
.octa 0x00000000000000000000000000000001
ddq_add_2:
@@ -169,7 +170,12 @@ ddq_add_8:
.rept (by - 1)
club DDQ_DATA, i
club XDATA, i
- vpaddd var_ddq_add(%rip), xcounter, var_xdata
+ vpaddq var_ddq_add(%rip), xcounter, var_xdata
+ vptest ddq_low_msk(%rip), var_xdata
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
vpshufb xbyteswap, var_xdata, var_xdata
.set i, (i +1)
.endr
@@ -178,7 +184,11 @@ ddq_add_8:
vpxor xkey0, xdata0, xdata0
club DDQ_DATA, by
- vpaddd var_ddq_add(%rip), xcounter, xcounter
+ vpaddq var_ddq_add(%rip), xcounter, xcounter
+ vptest ddq_low_msk(%rip), xcounter
+ jnz 1f
+ vpaddq ddq_high_add_1(%rip), xcounter, xcounter
+ 1:
.set i, 1
.rept (by - 1)
diff --git a/arch/x86/crypto/sha-mb/Makefile b/arch/x86/crypto/sha-mb/Makefile
new file mode 100644
index 000000000000..2f8756375df5
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/Makefile
@@ -0,0 +1,11 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+# Probe the assembler with an AVX2 gather instruction (vpgatherdd); the
+# sha1-mb objects are only added to the build when the probe yields "yes".
+avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
+ $(comma)4)$(comma)%ymm2,yes,no)
+ifeq ($(avx2_supported),yes)
+ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb.o
+ sha1-mb-y := sha1_mb.o sha1_mb_mgr_flush_avx2.o \
+ sha1_mb_mgr_init_avx2.o sha1_mb_mgr_submit_avx2.o sha1_x8_avx2.o
+endif
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
new file mode 100644
index 000000000000..99eefd812958
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -0,0 +1,935 @@
+/*
+ * Multi buffer SHA1 algorithm Glue Code
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/sha.h>
+#include <crypto/mcryptd.h>
+#include <crypto/crypto_wq.h>
+#include <asm/byteorder.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
+#include <asm/xsave.h>
+#include <linux/hardirq.h>
+#include <asm/fpu-internal.h>
+#include "sha_mb_ctx.h"
+
+#define FLUSH_INTERVAL 1000 /* in usec */
+
+static struct mcryptd_alg_state sha1_mb_alg_state;
+
+/*
+ * Transform context of the outer "sha1_mb" ahash: it only holds the inner
+ * mcryptd ahash that sha1_mb_async_init_tfm() allocates and to which every
+ * async operation forwards its request.
+ */
+struct sha1_mb_ctx {
+ struct mcryptd_ahash *mcryptd_tfm;
+};
+
+/*
+ * Recover the mcryptd request context that embeds a SHA1 hash context.
+ * The hash context is the __ctx area of a shash_desc, and that shash_desc
+ * is the desc member of struct mcryptd_hash_request_ctx.
+ */
+static inline struct mcryptd_hash_request_ctx *cast_hash_to_mcryptd_ctx(struct sha1_hash_ctx *hash_ctx)
+{
+	struct shash_desc *shash = container_of((void *) hash_ctx,
+						struct shash_desc, __ctx);
+
+	return container_of(shash, struct mcryptd_hash_request_ctx, desc);
+}
+
+/*
+ * Recover the ahash_request whose __ctx area is the given mcryptd request
+ * context.
+ */
+static inline struct ahash_request *cast_mcryptd_ctx_to_req(struct mcryptd_hash_request_ctx *ctx)
+{
+	struct ahash_request *owner;
+
+	owner = container_of((void *) ctx, struct ahash_request, __ctx);
+	return owner;
+}
+
+/*
+ * Reset the per-request flag word to HASH_UPDATE before a new operation.
+ * The desc argument is currently unused.
+ */
+static void req_ctx_init(struct mcryptd_hash_request_ctx *rctx,
+ struct shash_desc *desc)
+{
+ rctx->flag = HASH_UPDATE;
+}
+
+/*
+ * Job manager entry points. sha1_mb_mod_init() binds them to the
+ * sha1_mb_mgr_*_avx2 implementations before any context manager is
+ * initialised.
+ */
+static asmlinkage void (*sha1_job_mgr_init)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_submit)(struct sha1_mb_mgr *state,
+ struct job_sha1 *job);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_flush)(struct sha1_mb_mgr *state);
+static asmlinkage struct job_sha1* (*sha1_job_mgr_get_comp_job)(struct sha1_mb_mgr *state);
+
+/*
+ * Load the SHA1 initial chaining values (SHA1_H0..SHA1_H4) into digest.
+ * The copy covers the whole SHA1_DIGEST_LENGTH-word table.
+ */
+inline void sha1_init_digest(uint32_t *digest)
+{
+	static const uint32_t sha1_iv[SHA1_DIGEST_LENGTH] = {
+		SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4
+	};
+
+	memcpy(digest, sha1_iv, sizeof(sha1_iv));
+}
+
+/*
+ * Append the SHA1 padding to the partially filled block in padblock.
+ *
+ * @padblock:  buffer of two blocks; its first (total_len % SHA1_BLOCK_SIZE)
+ *             bytes hold the unhashed tail of the message
+ * @total_len: message length in bytes
+ *
+ * Writes the 0x80 marker, zero fill and the big-endian 64-bit message
+ * length in bits, and returns how many blocks (1 or 2) must still be
+ * hashed.
+ *
+ * The bit length is computed in 64 bits: shifting the 32-bit byte count
+ * directly would drop its top three bits and corrupt the digest of any
+ * message of 512 MiB or more. Messages of 4 GiB or more remain
+ * unsupported because total_len itself is only 32 bits wide.
+ */
+inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
+			 uint32_t total_len)
+{
+	uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+	memset(&padblock[i], 0, SHA1_BLOCK_SIZE);
+	padblock[i] = 0x80;
+
+	/* Advance to the end of the block that will hold the length field */
+	i += ((SHA1_BLOCK_SIZE - 1) &
+	      (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1)))
+	     + 1 + SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+	*((uint64_t *) &padblock[i - 16]) = 0;
+#endif
+
+	/* Widen before shifting so the bit count cannot wrap at 32 bits */
+	*((uint64_t *) &padblock[i - 8]) =
+		cpu_to_be64((uint64_t) total_len << 3);
+
+	/* Number of extra blocks to hash */
+	return i >> SHA1_LOG2_BLOCK_SIZE;
+}
+
+/*
+ * Drive a context returned by the job manager until it can either be
+ * handed back to the caller or has been queued again.
+ *
+ * Each iteration looks at the current ctx:
+ *  - COMPLETE set: clear the other status bits and return it;
+ *  - partial buffer empty and user data pending: stash the sub-block tail
+ *    in the partial buffer and submit the whole blocks;
+ *  - LAST set: pad the partial buffer and submit the final block(s);
+ *  - otherwise: mark the context IDLE and return it.
+ *
+ * Every submit replaces ctx with whatever job the manager hands back,
+ * which may belong to a different request or be NULL. Returns NULL when
+ * no context is ready to be returned.
+ */
+static struct sha1_hash_ctx *sha1_ctx_mgr_resubmit(struct sha1_ctx_mgr *mgr, struct sha1_hash_ctx *ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ /* Clear PROCESSING bit */
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ return ctx;
+ }
+
+ /*
+ * If the extra blocks are empty, begin hashing what remains
+ * in the user's buffer.
+ */
+ if (ctx->partial_block_buffer_length == 0 &&
+ ctx->incoming_buffer_length) {
+
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+ uint32_t copy_len;
+
+ /*
+ * Only entire blocks can be hashed.
+ * Copy remainder to extra blocks buffer.
+ */
+ copy_len = len & (SHA1_BLOCK_SIZE-1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy(ctx->partial_block_buffer,
+ ((const char *) buffer + len),
+ copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ /* len should be a multiple of the block size now */
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ /* Set len to the number of blocks to be hashed */
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ /* ctx now refers to whatever job the manager returned */
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+
+ /*
+ * If the extra blocks are not empty, then we are
+ * either on the last block(s) or we need more
+ * user input before continuing.
+ */
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = sha1_pad(buf, ctx->total_length);
+
+ /* COMPLETE is set now so the next pass returns this ctx */
+ ctx->status = (HASH_CTX_STS_PROCESSING |
+ HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ /* Nothing left to hash for now: hand the context back as idle */
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+/*
+ * Fetch a context whose processing has finished, if there is one.
+ *
+ * The job manager is asked for a completed job. A NULL job means nothing
+ * has completed. Otherwise the job is run through sha1_ctx_mgr_resubmit(),
+ * which either hands back a context that is safe to return or requeues
+ * the remaining work and yields NULL.
+ */
+static struct sha1_hash_ctx *sha1_ctx_mgr_get_comp_ctx(struct sha1_ctx_mgr *mgr)
+{
+	struct job_sha1 *done_job = sha1_job_mgr_get_comp_job(&mgr->mgr);
+
+	return sha1_ctx_mgr_resubmit(mgr, (struct sha1_hash_ctx *) done_job);
+}
+
+/* Initialise the job manager embedded in the context manager. */
+static void sha1_ctx_mgr_init(struct sha1_ctx_mgr *mgr)
+{
+ sha1_job_mgr_init(&mgr->mgr);
+}
+
+/*
+ * Submit len bytes at buffer for hashing in context ctx.
+ *
+ * flags may only contain bits of HASH_ENTIRE; HASH_FIRST resets the
+ * digest and counters, HASH_LAST marks the final chunk of the message.
+ *
+ * Returns ctx itself with ctx->error set when the request is rejected
+ * (bad flags, context still processing, or already completed without
+ * HASH_FIRST). Otherwise returns whatever sha1_ctx_mgr_resubmit() yields:
+ * a context that is ready to be handed back (not necessarily this one),
+ * or NULL when none is ready yet.
+ */
+static struct sha1_hash_ctx *sha1_ctx_mgr_submit(struct sha1_ctx_mgr *mgr,
+ struct sha1_hash_ctx *ctx,
+ const void *buffer,
+ uint32_t len,
+ int flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ /* User should not pass anything other than FIRST, UPDATE, or LAST */
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ /* Cannot submit to a currently processing job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ /* Cannot update a finished job. */
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+
+ if (flags & HASH_FIRST) {
+ /* Init digest */
+ sha1_init_digest(ctx->job.result_digest);
+
+ /* Reset byte counter */
+ ctx->total_length = 0;
+
+ /* Clear extra blocks */
+ ctx->partial_block_buffer_length = 0;
+ }
+
+ /* If we made it here, there were no errors during this call to submit */
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ /* Store buffer ptr info from user */
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ /* Store the user's request flags and mark this ctx as currently being processed. */
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ /* Advance byte counter */
+ ctx->total_length += len;
+
+ /*
+ * If there is anything currently buffered in the extra blocks,
+ * append to it until it contains a whole block.
+ * Or if the user's buffer contains less than a whole block,
+ * append as much as possible to the extra block.
+ */
+ /* Bitwise OR of a length and a comparison result: true if either is non-zero */
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ /* Compute how many bytes to copy from user buffer into extra block */
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ /* Copy and update relevant pointers and counters */
+ memcpy(&ctx->partial_block_buffer[ctx->partial_block_buffer_length],
+ buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+
+ /* The extra block should never contain more than 1 block here */
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ /* If the extra block buffer contains exactly 1 block, it can be hashed. */
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ /* From here on ctx is whatever job the manager returned */
+ ctx = (struct sha1_hash_ctx *) sha1_job_mgr_submit(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+/*
+ * Force the job manager to make progress until some context becomes
+ * ready to be returned.
+ *
+ * Returns that context, or NULL once the job manager reports that no
+ * jobs are in flight any more.
+ */
+static struct sha1_hash_ctx *sha1_ctx_mgr_flush(struct sha1_ctx_mgr *mgr)
+{
+	struct job_sha1 *job;
+	struct sha1_hash_ctx *ready;
+
+	for (;;) {
+		job = sha1_job_mgr_flush(&mgr->mgr);
+
+		/* A NULL job means there are no more jobs in flight. */
+		if (!job)
+			return NULL;
+
+		/*
+		 * The flushed job may still have work pending; resubmit it.
+		 * A non-NULL result is ready for the caller, otherwise every
+		 * managed job still needs processing and we flush again.
+		 */
+		ready = sha1_ctx_mgr_resubmit(mgr,
+					      (struct sha1_hash_ctx *) job);
+		if (ready)
+			return ready;
+	}
+}
+
+/*
+ * shash .init hook: put the descriptor's hash context into its pristine
+ * state -- SHA1 initial digest, zero byte count, empty partial block and
+ * IDLE status. Always returns 0.
+ */
+static int sha1_mb_init(struct shash_desc *desc)
+{
+	static const uint32_t sha1_iv[5] = {
+		SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4
+	};
+	struct sha1_hash_ctx *hctx = shash_desc_ctx(desc);
+	int w;
+
+	hash_ctx_init(hctx);
+
+	for (w = 0; w < 5; w++)
+		hctx->job.result_digest[w] = sha1_iv[w];
+
+	hctx->total_length = 0;
+	hctx->partial_block_buffer_length = 0;
+	/* Must follow hash_ctx_init(): the final status is IDLE */
+	hctx->status = HASH_CTX_STS_IDLE;
+
+	return 0;
+}
+
+/*
+ * Store the five digest words of the request's hash context into
+ * rctx->out in big-endian order. Always returns 0.
+ */
+static int sha1_mb_set_results(struct mcryptd_hash_request_ctx *rctx)
+{
+	struct sha1_hash_ctx *hctx = shash_desc_ctx(&rctx->desc);
+	__be32 *out_words = (__be32 *) rctx->out;
+	int w = 0;
+
+	while (w < 5) {
+		out_words[w] = cpu_to_be32(hctx->job.result_digest[w]);
+		w++;
+	}
+
+	return 0;
+}
+
+/*
+ * Keep feeding the scatterlist walk of *ret_rctx to the context manager
+ * until the request's HASH_DONE flag is set.
+ *
+ * @ret_rctx: in: request to continue; out: the request that is now
+ *            finished, or NULL when nothing is ready yet. It may differ
+ *            from the input because a submit can hand back the context
+ *            of another request.
+ * @cstate:   per-cpu algorithm state supplying the context manager
+ * @flush:    when a submit yields no context, flush the manager instead
+ *            of giving up
+ *
+ * Returns 0, or the negative value reported by crypto_ahash_walk_done().
+ * When the finished request has HASH_FINAL set, its digest is copied out.
+ */
+static int sha_finish_walk(struct mcryptd_hash_request_ctx **ret_rctx,
+ struct mcryptd_alg_cstate *cstate, bool flush)
+{
+ int flag = HASH_UPDATE;
+ int nbytes, err = 0;
+ struct mcryptd_hash_request_ctx *rctx = *ret_rctx;
+ struct sha1_hash_ctx *sha_ctx;
+
+ /* more work ? */
+ while (!(rctx->flag & HASH_DONE)) {
+ nbytes = crypto_ahash_walk_done(&rctx->walk, 0);
+ if (nbytes < 0) {
+ err = nbytes;
+ goto out;
+ }
+ /* check if the walk is done */
+ if (crypto_ahash_walk_last(&rctx->walk)) {
+ rctx->flag |= HASH_DONE;
+ if (rctx->flag & HASH_FINAL)
+ flag |= HASH_LAST;
+
+ }
+ sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(&rctx->desc);
+ /* The submit/flush calls run between kernel_fpu_begin() and kernel_fpu_end() */
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+ if (!sha_ctx) {
+ if (flush)
+ sha_ctx = sha1_ctx_mgr_flush(cstate->mgr);
+ }
+ kernel_fpu_end();
+ /* Continue with whichever request the manager handed back */
+ if (sha_ctx)
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ else {
+ rctx = NULL;
+ goto out;
+ }
+ }
+
+ /* copy the results */
+ if (rctx->flag & HASH_FINAL)
+ sha1_mb_set_results(rctx);
+
+out:
+ *ret_rctx = rctx;
+ return err;
+}
+
+/*
+ * Complete a finished request, then drain every other request the job
+ * manager has finished in the meantime.
+ *
+ * @rctx:   request to complete; it is removed from the work list first
+ * @cstate: per-cpu algorithm state (work list, lock, context manager)
+ * @err:    status passed to the completion callback of @rctx
+ *
+ * Completion callbacks are invoked with bottom halves disabled unless
+ * interrupts are already off. Always returns 0.
+ *
+ * Each drained request is completed through its OWN callback. Using the
+ * callback of @rctx here would be wrong: that request has already been
+ * completed (and may have been released by its owner), and it need not
+ * share a completion routine with the drained request.
+ */
+static int sha_complete_job(struct mcryptd_hash_request_ctx *rctx,
+			    struct mcryptd_alg_cstate *cstate,
+			    int err)
+{
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha1_hash_ctx *sha_ctx;
+	struct mcryptd_hash_request_ctx *req_ctx;
+	int ret;
+
+	/* remove from work list */
+	spin_lock(&cstate->work_lock);
+	list_del(&rctx->waiter);
+	spin_unlock(&cstate->work_lock);
+
+	if (irqs_disabled())
+		rctx->complete(&req->base, err);
+	else {
+		local_bh_disable();
+		rctx->complete(&req->base, err);
+		local_bh_enable();
+	}
+
+	/* check to see if there are other jobs that are done */
+	sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+	while (sha_ctx) {
+		req_ctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		ret = sha_finish_walk(&req_ctx, cstate, false);
+		if (req_ctx) {
+			spin_lock(&cstate->work_lock);
+			list_del(&req_ctx->waiter);
+			spin_unlock(&cstate->work_lock);
+
+			req = cast_mcryptd_ctx_to_req(req_ctx);
+			if (irqs_disabled())
+				req_ctx->complete(&req->base, ret);
+			else {
+				local_bh_disable();
+				req_ctx->complete(&req->base, ret);
+				local_bh_enable();
+			}
+		}
+		sha_ctx = sha1_ctx_mgr_get_comp_ctx(cstate->mgr);
+	}
+
+	return 0;
+}
+
+/*
+ * Tag a request with its arrival time, sequence number and flush
+ * deadline (arrival + FLUSH_INTERVAL), append it to the per-cpu work
+ * list and arm the flusher for that interval.
+ */
+static void sha1_mb_add_list(struct mcryptd_hash_request_ctx *rctx,
+			     struct mcryptd_alg_cstate *cstate)
+{
+	unsigned long flush_delay = usecs_to_jiffies(FLUSH_INTERVAL);
+
+	/* initialize tag; the deadline derives from the stored arrival time */
+	rctx->tag.arrival = jiffies;
+	rctx->tag.seq_num = cstate->next_seq_num++;
+	rctx->tag.expire = rctx->tag.arrival + flush_delay;
+
+	spin_lock(&cstate->work_lock);
+	list_add_tail(&rctx->waiter, &cstate->work_list);
+	spin_unlock(&cstate->work_lock);
+
+	mcryptd_arm_flusher(cstate, flush_delay);
+}
+
+/*
+ * shash .update hook of the inner algorithm.
+ *
+ * The data and len arguments are not used: the data is taken from the
+ * scatterlist walk of the ahash request that embeds this descriptor.
+ * The request is queued on the per-cpu work list and its first chunk is
+ * submitted to the context manager.
+ *
+ * Returns -EINPROGRESS when no request is ready yet. Otherwise the
+ * request handed back by the manager (not necessarily this one) is
+ * completed via sha_complete_job() and its status is returned.
+ *
+ * NOTE(review): the two early "goto done" paths run before
+ * sha1_mb_add_list(), yet sha_complete_job() does list_del() on
+ * rctx->waiter -- confirm the waiter node is valid on those paths. The
+ * cpu-clash path also reports ret == 0.
+ */
+static int sha1_mb_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(desc, struct mcryptd_hash_request_ctx, desc);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0, nbytes;
+
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, desc);
+
+ nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+ if (nbytes < 0) {
+ ret = nbytes;
+ goto done;
+ }
+
+ if (crypto_ahash_walk_last(&rctx->walk))
+ rctx->flag |= HASH_DONE;
+
+ /* submit */
+ sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, HASH_UPDATE);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ /* The returned context may belong to a different request */
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+/*
+ * shash .finup hook of the inner algorithm: hash the remaining data of
+ * the request and produce the digest in out.
+ *
+ * The data and len arguments are not used: the data is taken from the
+ * scatterlist walk of the ahash request that embeds this descriptor.
+ * The request is flagged HASH_FINAL, queued on the per-cpu work list and
+ * its first chunk is submitted (as the last chunk when the walk is
+ * already complete).
+ *
+ * Returns -EINPROGRESS when no request is ready yet. Otherwise the
+ * request handed back by the manager is completed via sha_complete_job()
+ * and its status is returned.
+ *
+ * On a context error the request to complete is derived from the
+ * returned context, matching sha1_mb_update() and sha1_mb_final().
+ */
+static int sha1_mb_finup(struct shash_desc *desc, const u8 *data,
+			 unsigned int len, u8 *out)
+{
+	struct mcryptd_hash_request_ctx *rctx =
+		container_of(desc, struct mcryptd_hash_request_ctx, desc);
+	struct mcryptd_alg_cstate *cstate =
+		this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+	struct ahash_request *req = cast_mcryptd_ctx_to_req(rctx);
+	struct sha1_hash_ctx *sha_ctx;
+	int ret = 0, flag = HASH_UPDATE, nbytes;
+
+	/* sanity check */
+	if (rctx->tag.cpu != smp_processor_id()) {
+		pr_err("mcryptd error: cpu clash\n");
+		goto done;
+	}
+
+	/* need to init context */
+	req_ctx_init(rctx, desc);
+
+	nbytes = crypto_ahash_walk_first(req, &rctx->walk);
+
+	if (nbytes < 0) {
+		ret = nbytes;
+		goto done;
+	}
+
+	if (crypto_ahash_walk_last(&rctx->walk)) {
+		rctx->flag |= HASH_DONE;
+		flag = HASH_LAST;
+	}
+	rctx->out = out;
+
+	/* submit */
+	rctx->flag |= HASH_FINAL;
+	sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+	sha1_mb_add_list(rctx, cstate);
+
+	kernel_fpu_begin();
+	sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, rctx->walk.data, nbytes, flag);
+	kernel_fpu_end();
+
+	/* check if anything is returned */
+	if (!sha_ctx)
+		return -EINPROGRESS;
+
+	if (sha_ctx->error) {
+		ret = sha_ctx->error;
+		/* complete the request that owns the returned context */
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		goto done;
+	}
+
+	rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+	ret = sha_finish_walk(&rctx, cstate, false);
+	if (!rctx)
+		return -EINPROGRESS;
+done:
+	sha_complete_job(rctx, cstate, ret);
+	return ret;
+}
+
+/*
+ * shash .final hook of the inner algorithm: finish the hash without any
+ * further input and produce the digest in out.
+ *
+ * The request is flagged HASH_DONE | HASH_FINAL, queued on the per-cpu
+ * work list and a zero-length HASH_LAST chunk is submitted.
+ *
+ * Returns -EINPROGRESS when no request is ready yet. Otherwise the
+ * request handed back by the manager (not necessarily this one) is
+ * completed via sha_complete_job() and its status is returned.
+ *
+ * NOTE(review): the cpu-clash "goto done" runs before sha1_mb_add_list(),
+ * yet sha_complete_job() does list_del() on rctx->waiter -- confirm the
+ * waiter node is valid on that path.
+ */
+static int sha1_mb_final(struct shash_desc *desc, u8 *out)
+{
+ struct mcryptd_hash_request_ctx *rctx =
+ container_of(desc, struct mcryptd_hash_request_ctx, desc);
+ struct mcryptd_alg_cstate *cstate =
+ this_cpu_ptr(sha1_mb_alg_state.alg_cstate);
+
+ struct sha1_hash_ctx *sha_ctx;
+ int ret = 0;
+ /* Only its address is passed, together with a length of 0 */
+ u8 data;
+
+ /* sanity check */
+ if (rctx->tag.cpu != smp_processor_id()) {
+ pr_err("mcryptd error: cpu clash\n");
+ goto done;
+ }
+
+ /* need to init context */
+ req_ctx_init(rctx, desc);
+
+ rctx->out = out;
+ rctx->flag |= HASH_DONE | HASH_FINAL;
+
+ sha_ctx = (struct sha1_hash_ctx *) shash_desc_ctx(desc);
+ /* flag HASH_FINAL and 0 data size */
+ sha1_mb_add_list(rctx, cstate);
+ kernel_fpu_begin();
+ sha_ctx = sha1_ctx_mgr_submit(cstate->mgr, sha_ctx, &data, 0, HASH_LAST);
+ kernel_fpu_end();
+
+ /* check if anything is returned */
+ if (!sha_ctx)
+ return -EINPROGRESS;
+
+ if (sha_ctx->error) {
+ ret = sha_ctx->error;
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ goto done;
+ }
+
+ /* The returned context may belong to a different request */
+ rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+ ret = sha_finish_walk(&rctx, cstate, false);
+ if (!rctx)
+ return -EINPROGRESS;
+done:
+ sha_complete_job(rctx, cstate, ret);
+ return ret;
+}
+
+/*
+ * shash .export hook: copy the whole struct sha1_hash_ctx of the
+ * descriptor into out. Always returns 0.
+ */
+static int sha1_mb_export(struct shash_desc *desc, void *out)
+{
+	const struct sha1_hash_ctx *state = shash_desc_ctx(desc);
+
+	memcpy(out, state, sizeof(struct sha1_hash_ctx));
+	return 0;
+}
+
+/*
+ * shash .import hook: overwrite the descriptor's struct sha1_hash_ctx
+ * with the state stored at in. Always returns 0.
+ */
+static int sha1_mb_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_hash_ctx *state = shash_desc_ctx(desc);
+
+	memcpy(state, in, sizeof(struct sha1_hash_ctx));
+	return 0;
+}
+
+
+/*
+ * Inner multi-buffer algorithm, registered as "__sha1-mb" /
+ * "__intel_sha1-mb". sha1_mb_async_init_tfm() allocates it through
+ * mcryptd by its driver name. Descriptor and exported state both have
+ * the size of struct sha1_hash_ctx.
+ */
+static struct shash_alg sha1_mb_shash_alg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .init = sha1_mb_init,
+ .update = sha1_mb_update,
+ .final = sha1_mb_final,
+ .finup = sha1_mb_finup,
+ .export = sha1_mb_export,
+ .import = sha1_mb_import,
+ .descsize = sizeof(struct sha1_hash_ctx),
+ .statesize = sizeof(struct sha1_hash_ctx),
+ .base = {
+ .cra_name = "__sha1-mb",
+ .cra_driver_name = "__intel_sha1-mb",
+ .cra_priority = 100,
+ /*
+ * use ASYNC flag as some buffers in multi-buffer
+ * algo may not have completed before hashing thread sleep
+ */
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
+ }
+};
+
+/*
+ * ahash .init of the outer algorithm: clone the request into its own
+ * context area, retarget the clone at the inner mcryptd transform and
+ * forward the init operation to it.
+ */
+static int sha1_mb_async_init(struct ahash_request *req)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct mcryptd_ahash *inner_tfm = mb_ctx->mcryptd_tfm;
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+
+	memcpy(inner_req, req, sizeof(*req));
+	ahash_request_set_tfm(inner_req, &inner_tfm->base);
+
+	return crypto_ahash_init(inner_req);
+}
+
+/*
+ * ahash .update of the outer algorithm: clone the request into its own
+ * context area, retarget the clone at the inner mcryptd transform and
+ * forward the update operation to it.
+ */
+static int sha1_mb_async_update(struct ahash_request *req)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct mcryptd_ahash *inner_tfm = mb_ctx->mcryptd_tfm;
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+
+	memcpy(inner_req, req, sizeof(*req));
+	ahash_request_set_tfm(inner_req, &inner_tfm->base);
+
+	return crypto_ahash_update(inner_req);
+}
+
+/*
+ * ahash .finup of the outer algorithm: clone the request into its own
+ * context area, retarget the clone at the inner mcryptd transform and
+ * forward the finup operation to it.
+ */
+static int sha1_mb_async_finup(struct ahash_request *req)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct mcryptd_ahash *inner_tfm = mb_ctx->mcryptd_tfm;
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+
+	memcpy(inner_req, req, sizeof(*req));
+	ahash_request_set_tfm(inner_req, &inner_tfm->base);
+
+	return crypto_ahash_finup(inner_req);
+}
+
+/*
+ * ahash .final of the outer algorithm: clone the request into its own
+ * context area, retarget the clone at the inner mcryptd transform and
+ * forward the final operation to it.
+ */
+static int sha1_mb_async_final(struct ahash_request *req)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct mcryptd_ahash *inner_tfm = mb_ctx->mcryptd_tfm;
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+
+	memcpy(inner_req, req, sizeof(*req));
+	ahash_request_set_tfm(inner_req, &inner_tfm->base);
+
+	return crypto_ahash_final(inner_req);
+}
+
+/*
+ * ahash .digest of the outer algorithm: clone the request into its own
+ * context area, retarget the clone at the inner mcryptd transform and
+ * forward the digest operation to it.
+ */
+static int sha1_mb_async_digest(struct ahash_request *req)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
+	struct mcryptd_ahash *inner_tfm = mb_ctx->mcryptd_tfm;
+	struct ahash_request *inner_req = ahash_request_ctx(req);
+
+	memcpy(inner_req, req, sizeof(*req));
+	ahash_request_set_tfm(inner_req, &inner_tfm->base);
+
+	return crypto_ahash_digest(inner_req);
+}
+
+/*
+ * cra_init of the outer algorithm: allocate the inner mcryptd ahash for
+ * "__intel_sha1-mb", point it at the shared algorithm state, remember it
+ * in the transform context and size the request context so that it can
+ * hold a cloned ahash_request plus the inner transform's own request
+ * context.
+ *
+ * Returns 0, or the error encoded in the pointer returned by
+ * mcryptd_alloc_ahash().
+ */
+static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *mb_ctx = crypto_tfm_ctx(tfm);
+	struct mcryptd_ahash *inner_tfm;
+	struct mcryptd_hash_ctx *inner_ctx;
+
+	inner_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0);
+	if (IS_ERR(inner_tfm))
+		return PTR_ERR(inner_tfm);
+
+	inner_ctx = crypto_ahash_ctx(&inner_tfm->base);
+	inner_ctx->alg_state = &sha1_mb_alg_state;
+
+	mb_ctx->mcryptd_tfm = inner_tfm;
+
+	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
+				 sizeof(struct ahash_request) +
+				 crypto_ahash_reqsize(&inner_tfm->base));
+	return 0;
+}
+
+/* cra_exit of the outer algorithm: release the inner mcryptd ahash. */
+static void sha1_mb_async_exit_tfm(struct crypto_tfm *tfm)
+{
+	struct sha1_mb_ctx *mb_ctx;
+
+	mb_ctx = crypto_tfm_ctx(tfm);
+	mcryptd_free_ahash(mb_ctx->mcryptd_tfm);
+}
+
+/*
+ * Outer asynchronous algorithm, registered as "sha1" with driver name
+ * "sha1_mb" at priority 200. Every operation forwards to the inner
+ * mcryptd transform kept in struct sha1_mb_ctx.
+ */
+static struct ahash_alg sha1_mb_async_alg = {
+ .init = sha1_mb_async_init,
+ .update = sha1_mb_async_update,
+ .final = sha1_mb_async_final,
+ .finup = sha1_mb_async_finup,
+ .digest = sha1_mb_async_digest,
+ .halg = {
+ .digestsize = SHA1_DIGEST_SIZE,
+ .base = {
+ .cra_name = "sha1",
+ .cra_driver_name = "sha1_mb",
+ .cra_priority = 200,
+ .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SHA1_BLOCK_SIZE,
+ .cra_type = &crypto_ahash_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(sha1_mb_async_alg.halg.base.cra_list),
+ .cra_init = sha1_mb_async_init_tfm,
+ .cra_exit = sha1_mb_async_exit_tfm,
+ .cra_ctxsize = sizeof(struct sha1_mb_ctx),
+ .cra_alignmask = 0,
+ },
+ },
+};
+
+/*
+ * Flusher callback installed in sha1_mb_alg_state.flusher.
+ *
+ * While the oldest request on the per-cpu work list has reached its
+ * expiry time, force the context manager to flush, finish the walk of
+ * the request it hands back and complete that request. If requests
+ * remain afterwards, the flusher is re-armed for the expiry time of the
+ * new head of the list.
+ *
+ * Returns the expiry time (in jiffies) the flusher was re-armed for, or
+ * 0 when the work list is empty.
+ */
+static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
+{
+	struct mcryptd_hash_request_ctx *rctx;
+	unsigned long cur_time;
+	unsigned long next_flush = 0;
+	struct sha1_hash_ctx *sha_ctx;
+
+	cur_time = jiffies;
+
+	while (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/*
+		 * The condition carries its own parentheses; it must not
+		 * rely on the macro expansion to supply them.
+		 */
+		if (time_before(cur_time, rctx->tag.expire))
+			break;
+		kernel_fpu_begin();
+		sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
+		kernel_fpu_end();
+		if (!sha_ctx) {
+			pr_err("sha1_mb error: nothing got flushed for non-empty list\n");
+			break;
+		}
+		/* The flushed context need not belong to the list head */
+		rctx = cast_hash_to_mcryptd_ctx(sha_ctx);
+		sha_finish_walk(&rctx, cstate, true);
+		sha_complete_job(rctx, cstate, 0);
+	}
+
+	if (!list_empty(&cstate->work_list)) {
+		rctx = list_entry(cstate->work_list.next,
+				struct mcryptd_hash_request_ctx, waiter);
+		/* get the hash context and then flush time */
+		next_flush = rctx->tag.expire;
+		mcryptd_arm_flusher(cstate, get_delay(next_flush));
+	}
+	return next_flush;
+}
+
+/*
+ * Module init: set up the per-cpu multi-buffer state and register the
+ * inner shash and the outer ahash.
+ *
+ * Returns 0 on success, -ENODEV when the CPU lacks AVX2 or BMI2,
+ * -ENOMEM when an allocation fails, or the error reported by the crypto
+ * registration calls. Failures are no longer all reported as -ENODEV.
+ */
+static int __init sha1_mb_mod_init(void)
+{
+	int cpu;
+	int err;
+	struct mcryptd_alg_cstate *cpu_state;
+
+	/* check for dependent cpu features */
+	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+	    !boot_cpu_has(X86_FEATURE_BMI2))
+		return -ENODEV;
+
+	/* initialize multibuffer structures */
+	sha1_mb_alg_state.alg_cstate = alloc_percpu(struct mcryptd_alg_cstate);
+	if (!sha1_mb_alg_state.alg_cstate)
+		return -ENOMEM;
+
+	sha1_job_mgr_init = sha1_mb_mgr_init_avx2;
+	sha1_job_mgr_submit = sha1_mb_mgr_submit_avx2;
+	sha1_job_mgr_flush = sha1_mb_mgr_flush_avx2;
+	sha1_job_mgr_get_comp_job = sha1_mb_mgr_get_comp_job_avx2;
+
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		cpu_state->next_flush = 0;
+		cpu_state->next_seq_num = 0;
+		cpu_state->flusher_engaged = false;
+		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
+		cpu_state->cpu = cpu;
+		cpu_state->alg_state = &sha1_mb_alg_state;
+		cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL);
+		if (!cpu_state->mgr) {
+			err = -ENOMEM;
+			goto err2;
+		}
+		sha1_ctx_mgr_init(cpu_state->mgr);
+		INIT_LIST_HEAD(&cpu_state->work_list);
+		spin_lock_init(&cpu_state->work_lock);
+	}
+	sha1_mb_alg_state.flusher = &sha1_mb_flusher;
+
+	err = crypto_register_shash(&sha1_mb_shash_alg);
+	if (err)
+		goto err2;
+	err = crypto_register_ahash(&sha1_mb_async_alg);
+	if (err)
+		goto err1;
+
+	return 0;
+err1:
+	crypto_unregister_shash(&sha1_mb_shash_alg);
+err2:
+	/*
+	 * alloc_percpu() hands out zeroed memory, so managers that were
+	 * never allocated are NULL here and kfree() ignores them.
+	 */
+	for_each_possible_cpu(cpu) {
+		cpu_state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		kfree(cpu_state->mgr);
+	}
+	free_percpu(sha1_mb_alg_state.alg_cstate);
+	return err;
+}
+
+/*
+ * Module exit: unregister both algorithms, then free every per-cpu
+ * context manager and finally the per-cpu state itself.
+ */
+static void __exit sha1_mb_mod_fini(void)
+{
+	struct mcryptd_alg_cstate *state;
+	int cpu;
+
+	crypto_unregister_ahash(&sha1_mb_async_alg);
+	crypto_unregister_shash(&sha1_mb_shash_alg);
+
+	for_each_possible_cpu(cpu) {
+		state = per_cpu_ptr(sha1_mb_alg_state.alg_cstate, cpu);
+		kfree(state->mgr);
+	}
+
+	free_percpu(sha1_mb_alg_state.alg_cstate);
+}
+
+module_init(sha1_mb_mod_init);
+module_exit(sha1_mb_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
+
+MODULE_ALIAS("sha1");
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
new file mode 100644
index 000000000000..86688c6e7a25
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_datastruct.S
@@ -0,0 +1,287 @@
+/*
+ * Header file for multi buffer SHA1 algorithm data structure
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+# Macros for defining data structures
+
+# Usage example
+
+#START_FIELDS # JOB_AES
+### name size align
+#FIELD _plaintext, 8, 8 # pointer to plaintext
+#FIELD _ciphertext, 8, 8 # pointer to ciphertext
+#FIELD _IV, 16, 8 # IV
+#FIELD _keys, 8, 8 # pointer to keys
+#FIELD _len, 4, 4 # length in bytes
+#FIELD _status, 4, 4 # status enumeration
+#FIELD _user_data, 8, 8 # pointer to user data
+#UNION _union, size1, align1, \
+# size2, align2, \
+# size3, align3, \
+# ...
+#END_FIELDS
+#%assign _JOB_AES_size _FIELD_OFFSET
+#%assign _JOB_AES_align _STRUCT_ALIGN
+
+#########################################################################
+
+# Alternate "struc-like" syntax:
+# STRUCT job_aes2
+# RES_Q .plaintext, 1
+# RES_Q .ciphertext, 1
+# RES_DQ .IV, 1
+# RES_B .nested, _JOB_AES_SIZE, _JOB_AES_ALIGN
+# RES_U .union, size1, align1, \
+# size2, align2, \
+# ...
+# ENDSTRUCT
+# # Following only needed if nesting
+# %assign job_aes2_size _FIELD_OFFSET
+# %assign job_aes2_align _STRUCT_ALIGN
+#
+# RES_* macros take a name, a count and an optional alignment.
+# The count is in terms of the base size of the macro, and the
+# default alignment is the base size.
+# The macros are:
+# Macro Base size
+# RES_B 1
+# RES_W 2
+# RES_D 4
+# RES_Q 8
+# RES_DQ 16
+# RES_Y 32
+# RES_Z 64
+#
+# RES_U defines a union. Its arguments are a name and two or more
+# pairs of "size, alignment"
+#
+# The two assigns are only needed if this structure is being nested
+# within another. Even if the assigns are not done, one can still use
+# STRUCT_NAME_size as the size of the structure.
+#
+# Note that for nesting, you still need to assign to STRUCT_NAME_size.
+#
+# The differences between this and using "struc" directly are that each
+# type is implicitly aligned to its natural length (although this can be
+# over-ridden with an explicit third parameter), and that the structure
+# is padded at the end to its overall alignment.
+#
+
+#########################################################################
+
+#ifndef _SHA1_MB_MGR_DATASTRUCT_ASM_
+#define _SHA1_MB_MGR_DATASTRUCT_ASM_
+
+## START_FIELDS - begin a new structure layout
+.macro START_FIELDS
+ _FIELD_OFFSET = 0 # running offset of the next field
+ _STRUCT_ALIGN = 0 # largest field alignment seen so far
+.endm
+
+## FIELD name size align - define symbol name as the aligned offset of a field of size bytes
+.macro FIELD name size align
+ _FIELD_OFFSET = (_FIELD_OFFSET + (\align) - 1) & (~ ((\align)-1)) # round the offset up to align; the mask form assumes align is a power of two
+ \name = _FIELD_OFFSET
+ _FIELD_OFFSET = _FIELD_OFFSET + (\size) # advance past this field
+.if (\align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = \align # remember the largest alignment for END_FIELDS
+.endif
+.endm
+
+## END_FIELDS - pad the total size held in _FIELD_OFFSET up to the largest field alignment
+.macro END_FIELDS
+ _FIELD_OFFSET = (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+.endm
+
+########################################################################
+
+.macro STRUCT p1
+START_FIELDS # reset offset and alignment before the struc-like definition
+.struc \p1 # NOTE(review): .struc is not a directive listed in the GNU as manual; looks like a NASM leftover - confirm before expanding this macro
+.endm
+
+.macro ENDSTRUCT
+ tmp = _FIELD_OFFSET # offset before END_FIELDS pads it
+ END_FIELDS
+ tmp = (_FIELD_OFFSET - %%tmp) # apparently the amount of tail padding; NOTE(review): %%tmp looks like NASM macro-local syntax, not GNU as - confirm
+.if (tmp > 0)
+ .lcomm tmp # NOTE(review): .lcomm normally takes a symbol and a length - confirm
+.endif
+.endstruc
+.endm
+
+## RES_int name size align - shared body of the RES_* macros: align, reserve size bytes with .lcomm, then update the offset and struct alignment
+.macro RES_int p1 p2 p3
+ name = \p1
+ size = \p2
+ align = .\p3 # NOTE(review): the leading dot in front of the third argument looks unintended - confirm
+
+ _FIELD_OFFSET = (_FIELD_OFFSET + (align) - 1) & (~ ((align)-1)) # same round-up as in FIELD
+.align align
+.lcomm name size
+ _FIELD_OFFSET = _FIELD_OFFSET + (size)
+.if (align > _STRUCT_ALIGN)
+ _STRUCT_ALIGN = align
+.endif
+.endm
+
+
+
+# macro RES_B name, size [, align] - reserve size bytes; NOTE(review): in all RES_* macros the arguments are handed to RES_int without the backslash GNU as uses for macro parameters, so they would be passed literally - confirm before use
+.macro RES_B _name, _size, _align=1
+RES_int _name _size _align
+.endm
+
+# macro RES_W name, size [, align] - reserve size 2-byte units
+.macro RES_W _name, _size, _align=2
+RES_int _name 2*(_size) _align
+.endm
+
+# macro RES_D name, size [, align] - reserve size 4-byte units
+.macro RES_D _name, _size, _align=4
+RES_int _name 4*(_size) _align
+.endm
+
+# macro RES_Q name, size [, align] - reserve size 8-byte units
+.macro RES_Q _name, _size, _align=8
+RES_int _name 8*(_size) _align
+.endm
+
+# macro RES_DQ name, size [, align] - reserve size 16-byte units
+.macro RES_DQ _name, _size, _align=16
+RES_int _name 16*(_size) _align
+.endm
+
+# macro RES_Y name, size [, align] - reserve size 32-byte units
+.macro RES_Y _name, _size, _align=32
+RES_int _name 32*(_size) _align
+.endm
+
+# macro RES_Z name, size [, align] - reserve size 64-byte units
+.macro RES_Z _name, _size, _align=64
+RES_int _name 64*(_size) _align
+.endm
+
+
+#endif
+
+########################################################################
+#### Define constants
+########################################################################
+
+########################################################################
+#### Define SHA1 Out Of Order Data Structures
+########################################################################
+
+START_FIELDS # LANE_DATA
+### name size align
+FIELD _job_in_lane, 8, 8 # pointer to job object
+END_FIELDS
+
+_LANE_DATA_size = _FIELD_OFFSET
+_LANE_DATA_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # SHA1_ARGS_X8
+### name size align
+FIELD _digest, 4*5*8, 16 # transposed digest
+FIELD _data_ptr, 8*8, 8 # array of pointers to data
+END_FIELDS
+
+_SHA1_ARGS_X4_size = _FIELD_OFFSET
+_SHA1_ARGS_X4_align = _STRUCT_ALIGN
+_SHA1_ARGS_X8_size = _FIELD_OFFSET
+_SHA1_ARGS_X8_align = _STRUCT_ALIGN
+
+########################################################################
+
+START_FIELDS # MB_MGR
+### name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align # SHA1_ARGS_X8 block; the X4 names are set to the same values as the X8 ones
+FIELD _lens, 4*8, 8 # eight 32-bit words, one per lane; submit stores (length << 4) | lane here
+FIELD _unused_lanes, 8, 8 # free lane numbers, one per nibble, taken from and returned to the low end
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align # eight LANE_DATA entries, one per lane
+END_FIELDS
+
+_MB_MGR_size = _FIELD_OFFSET
+_MB_MGR_align = _STRUCT_ALIGN
+
+_args_digest = _args + _digest # offset of the transposed digests from the start of MB_MGR
+_args_data_ptr = _args + _data_ptr # offset of the per-lane data pointers from the start of MB_MGR
+
+
+########################################################################
+#### Define constants
+########################################################################
+
+#define STS_UNKNOWN 0
+#define STS_BEING_PROCESSED 1
+#define STS_COMPLETED 2
+
+########################################################################
+#### Define JOB_SHA1 structure
+########################################################################
+
+START_FIELDS # JOB_SHA1
+
+### name size align
+FIELD _buffer, 8, 8 # pointer to buffer
+FIELD _len, 4, 4 # length in bytes (NOTE(review): submit shifts this left by 4 and passes it on as the hash length - confirm the unit)
+FIELD _result_digest, 5*4, 32 # Digest (output); submit also reads it as the starting digest
+FIELD _status, 4, 4 # one of the STS_* values
+FIELD _user_data, 8, 8 # not referenced by the assembly shown in these files
+END_FIELDS
+
+_JOB_SHA1_size = _FIELD_OFFSET
+_JOB_SHA1_align = _STRUCT_ALIGN
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
new file mode 100644
index 000000000000..85c4e1cf7172
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
@@ -0,0 +1,327 @@
+/*
+ * Flush routine for SHA1 multibuffer
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx2
+
+# LINUX register definitions
+#define arg1 %rdi
+#define arg2 %rsi
+
+# Common definitions
+#define state arg1
+#define job arg2
+#define len2 arg2
+
+# idx must be a register not clobbered by sha1_x8_avx2
+#define idx %r8
+#define DWORD_idx %r8d
+
+#define unused_lanes %rbx
+#define lane_data %rbx
+#define tmp2 %rbx
+#define tmp2_w %ebx
+
+#define job_rax %rax
+#define tmp1 %rax
+#define size_offset %rax
+#define tmp %rax
+#define start_offset %rax
+
+#define tmp3 %arg1 /* NOTE(review): arg1 is itself defined as %rdi, so this expands to %%rdi; tmp3 is not used in this file */
+
+#define extra_blocks %arg2 /* NOTE(review): expands to %%rsi for the same reason; not used in this file */
+#define p %arg2 /* NOTE(review): expands to %%rsi; not used in this file */
+
+
+# STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE = 10*16
+_GPR_SAVE_SIZE = 8*8
+_ALIGN_SIZE = 8
+
+_XMM_SAVE = 0
+_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+.macro LABEL prefix n
+\prefix\n\():
+.endm
+
+.macro JNE_SKIP i
+jne skip_\i
+.endm
+
+.altmacro
+.macro SET_OFFSET _offset
+offset = \_offset
+.endm
+.noaltmacro
+
+# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state) - complete one queued job even though not all lanes are filled; returns the job in rax, or NULL when every lane is empty
+# arg 1 : rdi : state (arg1 is defined as %rdi above)
+ENTRY(sha1_mb_mgr_flush_avx2)
+ mov %rsp, %r10
+ sub $STACK_SPACE, %rsp
+ and $~31, %rsp # round the frame down to a 32-byte boundary
+ mov %rbx, _GPR_SAVE(%rsp)
+ mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp
+ mov %rbp, _GPR_SAVE+8*3(%rsp)
+ mov %r12, _GPR_SAVE+8*4(%rsp)
+ mov %r13, _GPR_SAVE+8*5(%rsp)
+ mov %r14, _GPR_SAVE+8*6(%rsp)
+ mov %r15, _GPR_SAVE+8*7(%rsp)
+
+ # If bit (32+3) is set, then all lanes are empty
+ mov _unused_lanes(state), unused_lanes
+ bt $32+3, unused_lanes
+ jc return_null
+
+ # find a lane with a non-null job (the highest-numbered busy lane wins; idx stays 0 when lanes 1-7 are all empty)
+ xor idx, idx
+ offset = (_ldata + 1 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne one(%rip), idx
+ offset = (_ldata + 2 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne two(%rip), idx
+ offset = (_ldata + 3 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne three(%rip), idx
+ offset = (_ldata + 4 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne four(%rip), idx
+ offset = (_ldata + 5 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne five(%rip), idx
+ offset = (_ldata + 6 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne six(%rip), idx
+ offset = (_ldata + 7 * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+ cmovne seven(%rip), idx
+
+ # copy idx to empty lanes: every lane without a job gets the data pointer of lane idx and length 0xFFFFFFFF
+copy_lane_data:
+ offset = (_args + _data_ptr)
+ mov offset(state,idx,8), tmp
+
+ I = 0
+.rep 8
+ offset = (_ldata + I * _LANE_DATA_size + _job_in_lane)
+ cmpq $0, offset(state)
+.altmacro
+ JNE_SKIP %I
+ offset = (_args + _data_ptr + 8*I)
+ mov tmp, offset(state)
+ offset = (_lens + 4*I)
+ movl $0xFFFFFFFF, offset(state)
+LABEL skip_ %I
+ I = (I+1)
+.noaltmacro
+.endr
+
+ # Find min length (each lens word carries the lane number in its low nibble)
+ vmovdqa _lens+0*16(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx # low nibble is the lane number
+ shr $4, len2 # remaining bits are the length
+ jz len_is_0 # that lane has nothing left to hash
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2 # keep only the length part of the minimum
+ vpshufd $0, %xmm2, %xmm2 # broadcast it to all four dwords
+
+ vpsubd %xmm2, %xmm0, %xmm0 # subtract the minimum length from every lane
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens+0*16(state)
+ vmovdqa %xmm1, _lens+1*16(state)
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+ # state and idx are intact
+
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data # lane_data shares rbx with unused_lanes and tmp2
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data) # the lane no longer holds a job
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes # push idx back as the lowest nibble of the free-lane list
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4) # largest unsigned length, so the min search will not pick this lane
+
+ vmovd _args_digest(state , idx, 4) , %xmm0 # digests are stored transposed: word k of lane idx is at _args_digest + k*32 + idx*4
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax) # first four digest words
+ offset = (_result_digest + 1*16)
+ mov tmp2_w, offset(job_rax) # fifth digest word
+
+return:
+
+ mov _GPR_SAVE(%rsp), %rbx
+ mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp
+ mov _GPR_SAVE+8*3(%rsp), %rbp
+ mov _GPR_SAVE+8*4(%rsp), %r12
+ mov _GPR_SAVE+8*5(%rsp), %r13
+ mov _GPR_SAVE+8*6(%rsp), %r14
+ mov _GPR_SAVE+8*7(%rsp), %r15
+ mov %r10, %rsp
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+ENDPROC(sha1_mb_mgr_flush_avx2)
+
+
+#################################################################
+
+.align 16
+ENTRY(sha1_mb_mgr_get_comp_job_avx2)
+ push %rbx # rbx is used below as unused_lanes, lane_data and tmp2
+
+ ## if bit 32+3 is set, then all lanes are empty; otherwise return a job whose remaining length is already 0, without hashing, or NULL if there is none
+ mov _unused_lanes(state), unused_lanes
+ bt $(32+3), unused_lanes
+ jc .return_null
+
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ test $~0xF, idx # any bit above the lane nibble means the shortest job still has data left
+ jnz .return_null
+
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ movq $0, _job_in_lane(lane_data) # the lane no longer holds a job
+ movl $STS_COMPLETED, _status(job_rax)
+ mov _unused_lanes(state), unused_lanes
+ shl $4, unused_lanes # push idx back as the lowest nibble of the free-lane list
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4) # largest unsigned length, so the min search will not pick this lane
+
+ vmovd _args_digest(state, idx, 4), %xmm0 # gather the transposed digest words of lane idx
+ vpinsrd $1, _args_digest+1*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state, idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state, idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), tmp2_w
+
+ vmovdqu %xmm0, _result_digest(job_rax) # first four digest words
+ movl tmp2_w, _result_digest+1*16(job_rax) # fifth digest word
+
+ pop %rbx
+
+ ret
+
+.return_null:
+ xor job_rax, job_rax # return NULL
+ pop %rbx
+ ret
+ENDPROC(sha1_mb_mgr_get_comp_job_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+.octa 0x000000000000000000000000FFFFFFF0
+one:
+.quad 1
+two:
+.quad 2
+three:
+.quad 3
+four:
+.quad 4
+five:
+.quad 5
+six:
+.quad 6
+seven:
+.quad 7
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000000..4ca7e166a2aa
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,64 @@
+/*
+ * Initialization code for multi buffer SHA1 algorithm for AVX2
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "sha_mb_mgr.h"
+
+/*
+ * Put a multi-buffer SHA1 manager into its idle state: for each of the
+ * eight lanes the length word is set to 0xFFFFFFFF and the job pointer
+ * is cleared, and unused_lanes is set to 0xF76543210.
+ */
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
+{
+	unsigned int lane;
+
+	for (lane = 0; lane < 8; lane++) {
+		state->lens[lane] = 0xFFFFFFFF;
+		state->ldata[lane].job_in_lane = NULL;
+	}
+	state->unused_lanes = 0xF76543210;
+}
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
new file mode 100644
index 000000000000..2ab9560b53c8
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -0,0 +1,228 @@
+/*
+ * Buffer submit code for multi buffer SHA1 algorithm
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+
+.extern sha1_x8_avx2 # the routine called below; GNU as accepts but ignores .extern, so this line is documentation only
+
+# LINUX register definitions
+arg1 = %rdi
+arg2 = %rsi
+size_offset = %rcx
+tmp2 = %rcx
+extra_blocks = %rdx
+
+# Common definitions
+#define state arg1
+#define job %rsi
+#define len2 arg2
+#define p2 arg2
+
+# idx must be a register not clobbered by sha1_x8_avx2
+idx = %r8
+DWORD_idx = %r8d
+last_len = %r8
+
+p = %r11
+start_offset = %r11
+
+unused_lanes = %rbx
+BYTE_unused_lanes = %bl
+
+job_rax = %rax
+len = %rax
+DWORD_len = %eax
+
+lane = %rbp
+tmp3 = %rbp
+
+tmp = %r9
+DWORD_tmp = %r9d
+
+lane_data = %r10
+
+# STACK_SPACE needs to be an odd multiple of 8
+STACK_SPACE = 8*8 + 16*10 + 8
+
+# JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
+# arg 1 : rdi : state
+# arg 2 : rsi : job
+#
+# Places job in the free lane named by the low nibble of _unused_lanes.
+# While free lanes remain afterwards (unused_lanes != 0xF) nothing is
+# hashed and NULL is returned. Once the last lane has been taken,
+# sha1_x8_avx2 is run for the shortest queued length, and the job in
+# that lane is marked STS_COMPLETED and returned in rax.
+ENTRY(sha1_mb_mgr_submit_avx2)
+
+ mov %rsp, %r10
+ sub $STACK_SPACE, %rsp
+ and $~31, %rsp # round the frame down to a 32-byte boundary
+
+ mov %rbx, (%rsp)
+ mov %r10, 8*2(%rsp) #save old rsp
+ mov %rbp, 8*3(%rsp)
+ mov %r12, 8*4(%rsp)
+ mov %r13, 8*5(%rsp)
+ mov %r14, 8*6(%rsp)
+ mov %r15, 8*7(%rsp)
+
+ # Pop a free lane number off the low nibble of unused_lanes
+ mov _unused_lanes(state), unused_lanes
+ mov unused_lanes, lane
+ and $0xF, lane
+ shr $4, unused_lanes
+ imul $_LANE_DATA_size, lane, lane_data
+ movl $STS_BEING_PROCESSED, _status(job)
+ lea _ldata(state, lane_data), lane_data
+ mov unused_lanes, _unused_lanes(state)
+ movl _len(job), DWORD_len
+
+ # Record the job and store (length << 4) | lane as the lane length,
+ # so the min search below yields both the length and the lane
+ mov job, _job_in_lane(lane_data)
+ shl $4, len
+ or lane, len
+
+ movl DWORD_len, _lens(state , lane, 4)
+
+ # Load digest words from result_digest
+ # (stored transposed: word k of a lane is at _args_digest + k*32 + lane*4)
+ vmovdqu _result_digest(job), %xmm0
+ mov _result_digest+1*16(job), DWORD_tmp
+ vmovd %xmm0, _args_digest(state, lane, 4)
+ vpextrd $1, %xmm0, _args_digest+1*32(state , lane, 4)
+ vpextrd $2, %xmm0, _args_digest+2*32(state , lane, 4)
+ vpextrd $3, %xmm0, _args_digest+3*32(state , lane, 4)
+ movl DWORD_tmp, _args_digest+4*32(state , lane, 4)
+
+ mov _buffer(job), p
+ mov p, _args_data_ptr(state, lane, 8)
+
+ # Only the 0xF end marker left means every lane is now occupied
+ cmp $0xF, unused_lanes
+ jne return_null
+
+start_loop:
+ # Find min length
+ vmovdqa _lens(state), %xmm0
+ vmovdqa _lens+1*16(state), %xmm1
+
+ vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A}
+ vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has {x,x,E,F}
+ vpalignr $4, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,x,E}
+ vpminud %xmm3, %xmm2, %xmm2 # xmm2 has min value in low dword
+
+ vmovd %xmm2, DWORD_idx
+ mov idx, len2
+ and $0xF, idx # low nibble is the lane number
+ shr $4, len2 # remaining bits are the length
+ jz len_is_0
+
+ vpand clear_low_nibble(%rip), %xmm2, %xmm2 # keep only the length part
+ vpshufd $0, %xmm2, %xmm2 # broadcast it to all four dwords
+
+ vpsubd %xmm2, %xmm0, %xmm0 # subtract the minimum length from every lane
+ vpsubd %xmm2, %xmm1, %xmm1
+
+ vmovdqa %xmm0, _lens + 0*16(state)
+ vmovdqa %xmm1, _lens + 1*16(state)
+
+
+ # "state" and "args" are the same address, arg1
+ # len is arg2
+ call sha1_x8_avx2
+
+ # state and idx are intact
+
+len_is_0:
+ # process completed job "idx"
+ imul $_LANE_DATA_size, idx, lane_data
+ lea _ldata(state, lane_data), lane_data
+
+ mov _job_in_lane(lane_data), job_rax
+ mov _unused_lanes(state), unused_lanes
+ movq $0, _job_in_lane(lane_data)
+ movl $STS_COMPLETED, _status(job_rax)
+ shl $4, unused_lanes # push idx back as the lowest nibble of the free-lane list
+ or idx, unused_lanes
+ mov unused_lanes, _unused_lanes(state)
+
+ movl $0xFFFFFFFF, _lens(state, idx, 4) # largest unsigned length, so the min search will not pick this lane
+
+ vmovd _args_digest(state, idx, 4), %xmm0
+ vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
+ vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
+ movl _args_digest+4*32(state, idx, 4), DWORD_tmp # use the named offset like the other four words
+
+ vmovdqu %xmm0, _result_digest(job_rax)
+ movl DWORD_tmp, _result_digest+1*16(job_rax)
+
+return:
+
+ mov (%rsp), %rbx
+ mov 8*2(%rsp), %r10 #restore old rsp
+ mov 8*3(%rsp), %rbp
+ mov 8*4(%rsp), %r12
+ mov 8*5(%rsp), %r13
+ mov 8*6(%rsp), %r14
+ mov 8*7(%rsp), %r15
+ mov %r10, %rsp
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+ENDPROC(sha1_mb_mgr_submit_avx2)
+
+.data
+
+.align 16
+clear_low_nibble:
+ .octa 0x000000000000000000000000FFFFFFF0
diff --git a/arch/x86/crypto/sha-mb/sha1_x8_avx2.S b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
new file mode 100644
index 000000000000..8e1b47792b31
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha1_x8_avx2.S
@@ -0,0 +1,472 @@
+/*
+ * Multi-buffer SHA1 algorithm hash compute routine
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/linkage.h>
+#include "sha1_mb_mgr_datastruct.S"
+
+## code to compute eight-lane (oct) SHA1 using AVX2 (256-bit YMM registers)
+## outer calling routine takes care of save and restore of XMM registers
+
+## Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+##
+## Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+## Linux preserves: rdi rbp r8
+##
+## clobbers ymm0-15
+
+
+# TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+# "transpose" data in {r0...r7} using temps {t0...t1}
+# Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+# r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+# r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+# r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+# r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+# r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+# r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+# r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+#
+# Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+# r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+# r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+# r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+# r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+# r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+# r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+# r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+# r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+#
+
+.macro TRANSPOSE8 r0 r1 r2 r3 r4 r5 r6 r7 t0 t1
+ # process top half (r0..r3) {a...d}
+ vshufps $0x44, \r1, \r0, \t0 # t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps $0xEE, \r1, \r0, \r0 # r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps $0x44, \r3, \r2, \t1 # t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps $0xEE, \r3, \r2, \r2 # r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps $0xDD, \t1, \t0, \r3 # r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps $0x88, \r2, \r0, \r1 # r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps $0xDD, \r2, \r0, \r0 # r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps $0x88, \t1, \t0, \t0 # t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ # t0 still holds the {a...d} result above, so r2 serves as the temp here
+ # process bottom half (r4..r7) {e...h}
+ vshufps $0x44, \r5, \r4, \r2 # r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps $0xEE, \r5, \r4, \r4 # r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps $0x44, \r7, \r6, \t1 # t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps $0xEE, \r7, \r6, \r6 # r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps $0xDD, \t1, \r2, \r7 # r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps $0x88, \r6, \r4, \r5 # r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps $0xDD, \r6, \r4, \r4 # r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps $0x88, \t1, \r2, \t1 # t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 $0x13, \r1, \r5, \r6 # h6...a6
+ vperm2f128 $0x02, \r1, \r5, \r2 # h2...a2
+ vperm2f128 $0x13, \r3, \r7, \r5 # h5...a5
+ vperm2f128 $0x02, \r3, \r7, \r1 # h1...a1
+ vperm2f128 $0x13, \r0, \r4, \r7 # h7...a7
+ vperm2f128 $0x02, \r0, \r4, \r3 # h3...a3
+ vperm2f128 $0x13, \t0, \t1, \r4 # h4...a4
+ vperm2f128 $0x02, \t0, \t1, \r0 # h0...a0
+
+.endm
+##
+## Magic functions defined in FIPS 180-1
+##
+# macro MAGIC_F0 F,B,C,D,T ## F = (D ^ (B & (C ^ D))); T is accepted but not used
+.macro MAGIC_F0 regF regB regC regD regT
+ vpxor \regD, \regC, \regF # F = C ^ D
+ vpand \regB, \regF, \regF # F = B & (C ^ D)
+ vpxor \regD, \regF, \regF # F = D ^ (B & (C ^ D))
+.endm
+
+# macro MAGIC_F1 F,B,C,D,T ## F = (B ^ C ^ D); T is accepted but not used
+.macro MAGIC_F1 regF regB regC regD regT
+ vpxor \regC, \regD, \regF # F = C ^ D
+ vpxor \regB, \regF, \regF # F = B ^ C ^ D
+.endm
+
+# macro MAGIC_F2 F,B,C,D,T ## F = ((B & C) | (B & D) | (C & D)); T is clobbered
+.macro MAGIC_F2 regF regB regC regD regT
+ vpor \regC, \regB, \regF # F = B | C
+ vpand \regC, \regB, \regT # T = B & C
+ vpand \regD, \regF, \regF # F = (B | C) & D
+ vpor \regT, \regF, \regF # F = ((B | C) & D) | (B & C)
+.endm
+
+# macro MAGIC_F3 F,B,C,D,T ## F = (B ^ C ^ D), implemented by reusing MAGIC_F1
+.macro MAGIC_F3 regF regB regC regD regT
+ MAGIC_F1 \regF,\regB,\regC,\regD,\regT
+.endm
+
+# PROLD reg, imm, tmp: rotate each dword of reg left by imm bits in place; tmp is clobbered
+.macro PROLD reg imm tmp
+ vpsrld $(32-\imm), \reg, \tmp # tmp = reg >> (32 - imm)
+ vpslld $\imm, \reg, \reg # reg = reg << imm
+ vpor \tmp, \reg, \reg # reg = the two halves combined: rotate left by imm
+.endm
+
+.macro PROLD_nd reg imm tmp src
+ vpsrld $(32-\imm), \src, \tmp # tmp = src >> (32 - imm)
+ vpslld $\imm, \src, \reg # reg = src << imm (src itself is not written)
+ vpor \tmp, \reg, \reg # reg = src rotated left by imm bits per dword
+.endm
+
+.macro SHA1_STEP_00_15 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE # E += round constant passed as immCNT
+ vpaddd \memW*32(%rsp), \regE, \regE # E += schedule word in 32-byte stack slot memW
+ PROLD_nd \regT, 5, \regF, \regA # T = A rotated left by 5; F is scratch here
+ vpaddd \regT, \regE, \regE # E += rotl(A, 5)
+ \MAGIC \regF, \regB, \regC, \regD, \regT ## F = MAGIC(B,C,D); T may be clobbered
+ PROLD \regB, 30, \regT # B = B rotated left by 30
+ vpaddd \regF, \regE, \regE # E += F
+.endm
+
+.macro SHA1_STEP_16_79 regA regB regC regD regE regT regF memW immCNT MAGIC
+ vpaddd \immCNT, \regE, \regE # E += round constant passed as immCNT
+ offset = ((\memW - 14) & 15) * 32
+ vmovdqu offset(%rsp), W14 # W14 = word (memW - 14) from the 16-slot ring on the stack
+ vpxor W14, W16, W16 # W16 ^= word (memW - 14)
+ offset = ((\memW - 8) & 15) * 32
+ vpxor offset(%rsp), W16, W16 # W16 ^= word (memW - 8)
+ offset = ((\memW - 3) & 15) * 32
+ vpxor offset(%rsp), W16, W16 # W16 ^= word (memW - 3)
+ vpsrld $(32-1), W16, \regF
+ vpslld $1, W16, W16
+ vpor W16, \regF, \regF # F = XOR result rotated left by 1: the new schedule word
+
+ ROTATE_W
+
+ offset = ((\memW - 0) & 15) * 32
+ vmovdqu \regF, offset(%rsp) # store the new word into ring slot (memW & 15)
+ vpaddd \regF, \regE, \regE # E += new schedule word
+ PROLD_nd \regT, 5, \regF, \regA # T = A rotated left by 5; F is scratch here
+ vpaddd \regT, \regE, \regE # E += rotl(A, 5)
+ \MAGIC \regF,\regB,\regC,\regD,\regT ## FUN = MAGIC_Fi(B,C,D)
+ PROLD \regB,30, \regT # B = B rotated left by 30
+ vpaddd \regF, \regE, \regE # E += FUN
+.endm
+
+########################################################################
+########################################################################
+########################################################################
+
+## FRAMESZ plus pushes must be an odd multiple of 8
+YMM_SAVE = (15-15)*32 # evaluates to 0: no YMM save area is reserved
+FRAMESZ = 32*16 + YMM_SAVE # sixteen 32-byte stack slots for the message schedule
+_YMM = FRAMESZ - YMM_SAVE # offset where a YMM save area would start
+
+#define VMOVPS vmovups
+
+IDX = %rax
+inp0 = %r9
+inp1 = %r10
+inp2 = %r11
+inp3 = %r12
+inp4 = %r13
+inp5 = %r14
+inp6 = %r15
+inp7 = %rcx
+arg1 = %rdi
+arg2 = %rsi
+RSP_SAVE = %rdx
+
+# ymm0 A
+# ymm1 B
+# ymm2 C
+# ymm3 D
+# ymm4 E
+# ymm5 F AA
+# ymm6 T0 BB
+# ymm7 T1 CC
+# ymm8 T2 DD
+# ymm9 T3 EE
+# ymm10 T4 TMP
+# ymm11 T5 FUN
+# ymm12 T6 K
+# ymm13 T7 W14
+# ymm14 T8 W15
+# ymm15 T9 W16
+
+
+A = %ymm0
+B = %ymm1
+C = %ymm2
+D = %ymm3
+E = %ymm4
+F = %ymm5
+T0 = %ymm6
+T1 = %ymm7
+T2 = %ymm8
+T3 = %ymm9
+T4 = %ymm10
+T5 = %ymm11
+T6 = %ymm12
+T7 = %ymm13
+T8 = %ymm14
+T9 = %ymm15
+
+AA = %ymm5
+BB = %ymm6
+CC = %ymm7
+DD = %ymm8
+EE = %ymm9
+TMP = %ymm10
+FUN = %ymm11
+K = %ymm12
+W14 = %ymm13
+W15 = %ymm14
+W16 = %ymm15
+
+.macro ROTATE_ARGS
+ TMP_ = E # assembler symbol renaming only: no register contents are moved
+ E = D
+ D = C
+ C = B
+ B = A
+ A = TMP_ # the register that was E is referred to as A from now on
+.endm
+
+.macro ROTATE_W
+TMP_ = W16 # assembler symbol renaming only: no register contents are moved
+W16 = W15
+W15 = W14
+W14 = TMP_ # the register that was W16 is referred to as W14 from now on
+.endm
+
+# 8 streams x 5 32bit words per digest x 4 bytes per word
+#define DIGEST_SIZE (8*5*4)
+
+.align 32
+
+# void sha1_x8_avx2(void *args, UINT32 size)
+# arg 1 : pointer to args: five 32-byte digest rows at offset 0, eight input data pointers at _data_ptr
+# arg 2 : size (in blocks) ;; assumed to be >= 1
+#
+ENTRY(sha1_x8_avx2)
+
+ push RSP_SAVE # RSP_SAVE is rdx: keep the original value for the pop at the end
+
+ #save rsp
+ mov %rsp, RSP_SAVE
+ sub $FRAMESZ, %rsp
+
+ #align rsp to 32 Bytes
+ and $~0x1F, %rsp
+
+ ## Initialize digests
+ vmovdqu 0*32(arg1), A
+ vmovdqu 1*32(arg1), B
+ vmovdqu 2*32(arg1), C
+ vmovdqu 3*32(arg1), D
+ vmovdqu 4*32(arg1), E
+
+ ## load the eight input pointers (the transpose onto the stack happens in lloop)
+ mov _data_ptr+0*8(arg1),inp0
+ mov _data_ptr+1*8(arg1),inp1
+ mov _data_ptr+2*8(arg1),inp2
+ mov _data_ptr+3*8(arg1),inp3
+ mov _data_ptr+4*8(arg1),inp4
+ mov _data_ptr+5*8(arg1),inp5
+ mov _data_ptr+6*8(arg1),inp6
+ mov _data_ptr+7*8(arg1),inp7
+
+ xor IDX, IDX # IDX = byte offset shared by all eight input pointers
+lloop:
+ vmovdqu PSHUFFLE_BYTE_FLIP_MASK(%rip), F # shuffle mask; F (ymm5) is reused as AA below
+ I=0
+.rep 2
+ VMOVPS (inp0, IDX), T0 # each pass reads 32 bytes from every input
+ VMOVPS (inp1, IDX), T1
+ VMOVPS (inp2, IDX), T2
+ VMOVPS (inp3, IDX), T3
+ VMOVPS (inp4, IDX), T4
+ VMOVPS (inp5, IDX), T5
+ VMOVPS (inp6, IDX), T6
+ VMOVPS (inp7, IDX), T7
+
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb F, T0, T0
+ vmovdqu T0, (I*8)*32(%rsp) # stack slot I*8+k receives transposed row k
+ vpshufb F, T1, T1
+ vmovdqu T1, (I*8+1)*32(%rsp)
+ vpshufb F, T2, T2
+ vmovdqu T2, (I*8+2)*32(%rsp)
+ vpshufb F, T3, T3
+ vmovdqu T3, (I*8+3)*32(%rsp)
+ vpshufb F, T4, T4
+ vmovdqu T4, (I*8+4)*32(%rsp)
+ vpshufb F, T5, T5
+ vmovdqu T5, (I*8+5)*32(%rsp)
+ vpshufb F, T6, T6
+ vmovdqu T6, (I*8+6)*32(%rsp)
+ vpshufb F, T7, T7
+ vmovdqu T7, (I*8+7)*32(%rsp)
+ add $32, IDX # two passes advance IDX by 64 bytes per loop iteration
+ I = (I+1)
+.endr
+ # save old digests
+ vmovdqu A,AA
+ vmovdqu B,BB
+ vmovdqu C,CC
+ vmovdqu D,DD
+ vmovdqu E,EE
+
+##
+## perform 0-79 steps
+##
+ vmovdqu K00_19(%rip), K
+## do rounds 0...15
+ I = 0
+.rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 16...19
+ vmovdqu ((16 - 16) & 15) * 32 (%rsp), W16 # preload slot 0 as W16
+ vmovdqu ((16 - 15) & 15) * 32 (%rsp), W15 # preload slot 1 as W15
+.rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 20...39
+ vmovdqu K20_39(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 40...59
+ vmovdqu K40_59(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+## do rounds 60...79
+ vmovdqu K60_79(%rip), K
+.rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+ I = (I+1)
+.endr
+
+ vpaddd AA,A,A # add back the digest values saved before the rounds
+ vpaddd BB,B,B
+ vpaddd CC,C,C
+ vpaddd DD,D,D
+ vpaddd EE,E,E
+
+ sub $1, arg2 # one block done; loop until the block count reaches zero
+ jne lloop
+
+ # write out digests
+ vmovdqu A, 0*32(arg1)
+ vmovdqu B, 1*32(arg1)
+ vmovdqu C, 2*32(arg1)
+ vmovdqu D, 3*32(arg1)
+ vmovdqu E, 4*32(arg1)
+
+ # update input pointers
+ add IDX, inp0
+ add IDX, inp1
+ add IDX, inp2
+ add IDX, inp3
+ add IDX, inp4
+ add IDX, inp5
+ add IDX, inp6
+ add IDX, inp7
+ mov inp0, _data_ptr (arg1)
+ mov inp1, _data_ptr + 1*8(arg1)
+ mov inp2, _data_ptr + 2*8(arg1)
+ mov inp3, _data_ptr + 3*8(arg1)
+ mov inp4, _data_ptr + 4*8(arg1)
+ mov inp5, _data_ptr + 5*8(arg1)
+ mov inp6, _data_ptr + 6*8(arg1)
+ mov inp7, _data_ptr + 7*8(arg1)
+
+ ################
+ ## Postamble
+
+ mov RSP_SAVE, %rsp # drop the aligned frame: back to the rsp saved at entry
+ pop RSP_SAVE
+
+ ret
+ENDPROC(sha1_x8_avx2)
+
+
+.data
+
+.align 32
+K00_19:
+.octa 0x5A8279995A8279995A8279995A827999
+.octa 0x5A8279995A8279995A8279995A827999
+K20_39:
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+.octa 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
+K40_59:
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+.octa 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
+K60_79:
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+.octa 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
+PSHUFFLE_BYTE_FLIP_MASK:
+.octa 0x0c0d0e0f08090a0b0405060700010203
+.octa 0x0c0d0e0f08090a0b0405060700010203
diff --git a/arch/x86/crypto/sha-mb/sha_mb_ctx.h b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
new file mode 100644
index 000000000000..e36069d0c1bd
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha_mb_ctx.h
@@ -0,0 +1,136 @@
+/*
+ * Header file for multi buffer SHA context
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SHA_MB_CTX_INTERNAL_H
+#define _SHA_MB_CTX_INTERNAL_H
+
+#include "sha_mb_mgr.h"
+
+#define HASH_UPDATE 0x00
+#define HASH_FIRST 0x01
+#define HASH_LAST 0x02
+#define HASH_ENTIRE 0x03
+#define HASH_DONE 0x04
+#define HASH_FINAL 0x08
+
+#define HASH_CTX_STS_IDLE 0x00
+#define HASH_CTX_STS_PROCESSING 0x01
+#define HASH_CTX_STS_LAST 0x02
+#define HASH_CTX_STS_COMPLETE 0x04
+
+enum hash_ctx_error {
+ HASH_CTX_ERROR_NONE = 0,
+ HASH_CTX_ERROR_INVALID_FLAGS = -1,
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2,
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3,
+
+#ifdef HASH_CTX_DEBUG
+ HASH_CTX_ERROR_DEBUG_DIGEST_MISMATCH = -4,
+#endif
+};
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING) /* bit test: nonzero while the PROCESSING bit is set */
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE) /* exact compare: true only if COMPLETE is the sole bit set */
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; /* a freshly initialised ctx reports COMPLETE, not IDLE */ \
+ } while (0)
+
+
+/* Hash Constants and Typedefs */
+#define SHA1_DIGEST_LENGTH 5
+#define SHA1_LOG2_BLOCK_SIZE 6
+
+#define SHA1_PADLENGTHFIELD_SIZE 8
+
+#ifdef SHA_MB_DEBUG
+#define assert(expr) \
+do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+#else
+#define assert(expr) do {} while (0)
+#endif
+
+struct sha1_ctx_mgr {
+ struct sha1_mb_mgr mgr;
+};
+
+/* typedef struct sha1_ctx_mgr sha1_ctx_mgr; */
+
+struct sha1_hash_ctx {
+ /* Must be at struct offset 0 */
+ struct job_sha1 job;
+ /* status flag */
+ int status;
+ /* error flag */
+ int error;
+
+ uint32_t total_length;
+ const void *incoming_buffer;
+ uint32_t incoming_buffer_length;
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
+ uint32_t partial_block_buffer_length;
+ void *user_data;
+};
+
+#endif
diff --git a/arch/x86/crypto/sha-mb/sha_mb_mgr.h b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
new file mode 100644
index 000000000000..08ad1a9acfd7
--- /dev/null
+++ b/arch/x86/crypto/sha-mb/sha_mb_mgr.h
@@ -0,0 +1,110 @@
+/*
+ * Header file for multi buffer SHA1 algorithm manager
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Contact Information:
+ * James Guilford <james.guilford@intel.com>
+ * Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2014 Intel Corporation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef __SHA_MB_MGR_H
+#define __SHA_MB_MGR_H
+
+
+#include <linux/types.h>
+
+#define NUM_SHA1_DIGEST_WORDS 5
+
+enum job_sts { STS_UNKNOWN = 0,
+ STS_BEING_PROCESSED = 1,
+ STS_COMPLETED = 2,
+ STS_INTERNAL_ERROR = 3,
+ STS_ERROR = 4
+};
+
+struct job_sha1 {
+ u8 *buffer;
+ u32 len;
+ u32 result_digest[NUM_SHA1_DIGEST_WORDS] __aligned(32);
+ enum job_sts status;
+ void *user_data;
+};
+
+/* SHA1 out-of-order scheduler */
+
+/* typedef uint32_t sha1_digest_array[5][8]; */
+
+struct sha1_args_x8 {
+ uint32_t digest[5][8]; /* digest[word][lane]: each 32-byte row holds one digest word for all 8 lanes */
+ uint8_t *data_ptr[8]; /* one input data pointer per lane */
+};
+
+struct sha1_lane_data {
+ struct job_sha1 *job_in_lane;
+};
+
+struct sha1_mb_mgr {
+ struct sha1_args_x8 args;
+
+ uint32_t lens[8]; /* per lane: low nibble is the lane index, remaining bits the block count */
+
+ /* each nibble (4 bits) is the index (0...7) of an unused lane */
+ uint64_t unused_lanes;
+ /* byte 4 is set to FF as a flag (NOTE(review): entries are nibbles; verify flag value in init code) */
+ struct sha1_lane_data ldata[8];
+};
+
+
+#define SHA1_MB_MGR_NUM_LANES_AVX2 8
+
+void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_submit_avx2(struct sha1_mb_mgr *state,
+ struct job_sha1 *job);
+struct job_sha1 *sha1_mb_mgr_flush_avx2(struct sha1_mb_mgr *state);
+struct job_sha1 *sha1_mb_mgr_get_comp_job_avx2(struct sha1_mb_mgr *state);
+
+#endif
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index d21ff89207cd..df91466f973d 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -308,11 +308,8 @@ static int load_aout_binary(struct linux_binprm *bprm)
(current->mm->start_brk = N_BSSADDR(ex));
retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT);
- if (retval < 0) {
- /* Someone check-me: is this error path enough? */
- send_sig(SIGKILL, current, 0);
+ if (retval < 0)
return retval;
- }
install_exec_creds(bprm);
@@ -324,17 +321,13 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_brk(text_addr & PAGE_MASK, map_size);
- if (error != (text_addr & PAGE_MASK)) {
- send_sig(SIGKILL, current, 0);
+ if (error != (text_addr & PAGE_MASK))
return error;
- }
error = read_code(bprm->file, text_addr, 32,
ex.a_text + ex.a_data);
- if ((signed long)error < 0) {
- send_sig(SIGKILL, current, 0);
+ if ((signed long)error < 0)
return error;
- }
} else {
#ifdef WARN_OLD
static unsigned long error_time, error_time2;
@@ -368,20 +361,16 @@ static int load_aout_binary(struct linux_binprm *bprm)
MAP_EXECUTABLE | MAP_32BIT,
fd_offset);
- if (error != N_TXTADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_TXTADDR(ex))
return error;
- }
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
MAP_EXECUTABLE | MAP_32BIT,
fd_offset + ex.a_text);
- if (error != N_DATADDR(ex)) {
- send_sig(SIGKILL, current, 0);
+ if (error != N_DATADDR(ex))
return error;
- }
}
beyond_if:
set_binfmt(&aout_format);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 4299eb05023c..ffe71228fc10 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -151,6 +151,16 @@ ENTRY(ia32_sysenter_target)
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
+
+ /*
+ * Sysenter doesn't filter flags, so we need to clear NT
+ * ourselves. To save a few cycles, we can check whether
+ * NT was set instead of doing an unconditional popfq.
+ */
+ testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+ jnz sysenter_fix_flags
+sysenter_flags_fixed:
+
orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
CFI_REMEMBER_STATE
@@ -184,14 +194,16 @@ sysexit_from_sys_call:
TRACE_IRQS_ON
ENABLE_INTERRUPTS_SYSEXIT32
+ CFI_RESTORE_STATE
+
#ifdef CONFIG_AUDITSYSCALL
.macro auditsys_entry_common
- movl %esi,%r9d /* 6th arg: 4th syscall arg */
- movl %edx,%r8d /* 5th arg: 3rd syscall arg */
- /* (already in %ecx) 4th arg: 2nd syscall arg */
- movl %ebx,%edx /* 3rd arg: 1st syscall arg */
- movl %eax,%esi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
+ movl %esi,%r8d /* 5th arg: 4th syscall arg */
+ movl %ecx,%r9d /*swap with edx*/
+ movl %edx,%ecx /* 4th arg: 3rd syscall arg */
+ movl %r9d,%edx /* 3rd arg: 2nd syscall arg */
+ movl %ebx,%esi /* 2nd arg: 1st syscall arg */
+ movl %eax,%edi /* 1st arg: syscall number */
call __audit_syscall_entry
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
cmpq $(IA32_NR_syscalls-1),%rax
@@ -226,7 +238,6 @@ sysexit_from_sys_call:
.endm
sysenter_auditsys:
- CFI_RESTORE_STATE
auditsys_entry_common
movl %ebp,%r9d /* reload 6th syscall arg */
jmp sysenter_dispatch
@@ -235,6 +246,11 @@ sysexit_audit:
auditsys_exit sysexit_from_sys_call
#endif
+sysenter_fix_flags:
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+ popfq_cfi
+ jmp sysenter_flags_fixed
+
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 3ca9762e1649..d55a210a49bf 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -5,6 +5,8 @@ genhdr-y += unistd_64.h
genhdr-y += unistd_x32.h
generic-y += clkdev.h
-generic-y += early_ioremap.h
generic-y += cputime.h
+generic-y += dma-contiguous.h
+generic-y += early_ioremap.h
generic-y += mcs_spinlock.h
+generic-y += scatterlist.h
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0a3f9c9f98d5..473bdbee378a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -161,6 +161,20 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: : "i" (0), ## input)
+/*
+ * This is similar to alternative_input. But it has two features and
+ * respective instructions.
+ *
+ * If CPU has feature2, newinstr2 is used.
+ * Otherwise, if CPU has feature1, newinstr1 is used.
+ * Otherwise, oldinstr is used.
+ */
+#define alternative_input_2(oldinstr, newinstr1, feature1, newinstr2, \
+ feature2, input...) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, \
+ newinstr2, feature2) \
+ : : "i" (0), ## input)
+
/* Like alternative_input, but with a single output argument */
#define alternative_io(oldinstr, newinstr, feature, output, input...) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 79752f2bdec5..465b309af254 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -85,14 +85,6 @@ static inline bool apic_from_smp_config(void)
#include <asm/paravirt.h>
#endif
-#ifdef CONFIG_X86_64
-extern int is_vsmp_box(void);
-#else
-static inline int is_vsmp_box(void)
-{
- return 0;
-}
-#endif
extern int setup_profiling_timer(unsigned int);
static inline void native_apic_mem_write(u32 reg, u32 v)
@@ -300,7 +292,6 @@ struct apic {
int dest_logical;
unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
- unsigned long (*check_apicid_present)(int apicid);
void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
const struct cpumask *mask);
@@ -309,21 +300,11 @@ struct apic {
void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
void (*setup_apic_routing)(void);
- int (*multi_timer_check)(int apic, int irq);
int (*cpu_present_to_apicid)(int mps_cpu);
void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
- void (*setup_portio_remap)(void);
int (*check_phys_apicid_present)(int phys_apicid);
- void (*enable_apic_mode)(void);
int (*phys_pkg_id)(int cpuid_apic, int index_msb);
- /*
- * When one of the next two hooks returns 1 the apic
- * is switched to this. Essentially they are additional
- * probe functions:
- */
- int (*mps_oem_check)(struct mpc_table *mpc, char *oem, char *productid);
-
unsigned int (*get_apic_id)(unsigned long x);
unsigned long (*set_apic_id)(unsigned int id);
unsigned long apic_id_mask;
@@ -343,11 +324,7 @@ struct apic {
/* wakeup_secondary_cpu */
int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip);
- int trampoline_phys_low;
- int trampoline_phys_high;
-
bool wait_for_init_deassert;
- void (*smp_callin_clear_local_apic)(void);
void (*inquire_remote_apic)(int apicid);
/* apic ops */
@@ -378,14 +355,6 @@ struct apic {
* won't be applied properly during early boot in this case.
*/
int (*x86_32_early_logical_apicid)(int cpu);
-
- /*
- * Optional method called from setup_local_APIC() after logical
- * apicid is guaranteed to be known to initialize apicid -> node
- * mapping if NUMA initialization hasn't done so already. Don't
- * add new users.
- */
- int (*x86_32_numa_cpu_node)(int cpu);
#endif
};
@@ -496,14 +465,12 @@ static inline unsigned default_get_apic_id(unsigned long x)
}
/*
- * Warm reset vector default position:
+ * Warm reset vector position:
*/
-#define DEFAULT_TRAMPOLINE_PHYS_LOW 0x467
-#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
+#define TRAMPOLINE_PHYS_LOW 0x467
+#define TRAMPOLINE_PHYS_HIGH 0x469
#ifdef CONFIG_X86_64
-extern int default_acpi_madt_oem_check(char *, char *);
-
extern void apic_send_IPI_self(int vector);
DECLARE_PER_CPU(int, x2apic_extra_bits);
@@ -552,6 +519,8 @@ static inline int default_apic_id_valid(int apicid)
return (apicid < 255);
}
+extern int default_acpi_madt_oem_check(char *, char *);
+
extern void default_setup_apic_routing(void);
extern struct apic apic_noop;
@@ -635,11 +604,6 @@ static inline unsigned long default_check_apicid_used(physid_mask_t *map, int ap
return physid_isset(apicid, *map);
}
-static inline unsigned long default_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
{
*retmap = *phys_map;
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 6dd1c7dd0473..5e5cd123fdfb 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -24,7 +24,7 @@
*/
static inline int atomic_read(const atomic_t *v)
{
- return (*(volatile int *)&(v)->counter);
+ return ACCESS_ONCE((v)->counter);
}
/**
@@ -219,21 +219,6 @@ static inline short int atomic_inc_short(short int *v)
return *v;
}
-#ifdef CONFIG_X86_64
-/**
- * atomic_or_long - OR of two long integers
- * @v1: pointer to type unsigned long
- * @v2: pointer to type unsigned long
- *
- * Atomically ORs @v1 and @v2
- * Returns the result of the OR
- */
-static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
-{
- asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
-}
-#endif
-
/* These are x86-specific, used by some header files */
#define atomic_clear_mask(mask, addr) \
asm volatile(LOCK_PREFIX "andl %0,%1" \
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 46e9052bbd28..f8d273e18516 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -18,7 +18,7 @@
*/
static inline long atomic64_read(const atomic64_t *v)
{
- return (*(volatile long *)&(v)->counter);
+ return ACCESS_ONCE((v)->counter);
}
/**
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index afcd35d331de..cfe3b954d5e4 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -497,8 +497,6 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/sched.h>
-#define ARCH_HAS_FAST_MULTIPLIER 1
-
#include <asm/arch_hweight.h>
#include <asm-generic/bitops/const_hweight.h>
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73bfeb48..76659b67fd11 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -85,7 +85,7 @@ For 32-bit we have the following conventions - kernel is built with
#define ARGOFFSET R11
#define SWFRAME ORIG_RAX
- .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
+ .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp
CFI_ADJUST_CFA_OFFSET 9*8+\addskip
movq_cfi rdi, 8*8
@@ -96,7 +96,11 @@ For 32-bit we have the following conventions - kernel is built with
movq_cfi rcx, 5*8
.endif
+ .if \rax_enosys
+ movq $-ENOSYS, 4*8(%rsp)
+ .else
movq_cfi rax, 4*8
+ .endif
.if \save_r891011
movq_cfi r8, 3*8
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb9b258d60e7..0bb1335313b2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,6 +8,10 @@
#include <asm/required-features.h>
#endif
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#include <asm/disabled-features.h>
+#endif
+
#define NCAPINTS 11 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
@@ -202,6 +206,7 @@
#define X86_FEATURE_DECODEASSISTS ( 8*32+12) /* AMD Decode Assists support */
#define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */
#define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
@@ -250,8 +255,15 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+#ifdef CONFIG_X86_FEATURE_NAMES
extern const char * const x86_cap_flags[NCAPINTS*32];
extern const char * const x86_power_flags[32];
+#define X86_CAP_FMT "%s"
+#define x86_cap_flag(flag) x86_cap_flags[flag]
+#else
+#define X86_CAP_FMT "%d:%d"
+#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31)
+#endif
/*
* In order to save room, we index into this array by doing
@@ -274,6 +286,18 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
(((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
(((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+#define DISABLED_MASK_BIT_SET(bit) \
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
+
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
test_cpu_cap(c, bit))
@@ -282,6 +306,18 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
+/*
+ * This macro is for detection of features which need kernel
+ * infrastructure to be used. It may *not* directly test the CPU
+ * itself. Use the cpu_has() family if you want true runtime
+ * testing of CPU features, like in hypervisor code where you are
+ * supporting a possible guest feature where host support for it
+ * is not relevant.
+ */
+#define cpu_feature_enabled(bit) \
+ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : \
+ cpu_has(&boot_cpu_data, bit))
+
#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -296,11 +332,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
} while (0)
#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
-#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
-#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
@@ -316,9 +350,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2)
#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
-#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
-#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
-#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
@@ -353,25 +384,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT)
-#ifdef CONFIG_X86_64
-
-#undef cpu_has_vme
-#define cpu_has_vme 0
-
-#undef cpu_has_pae
-#define cpu_has_pae ___BUG___
-
-#undef cpu_has_k6_mtrr
-#define cpu_has_k6_mtrr 0
-
-#undef cpu_has_cyrix_arr
-#define cpu_has_cyrix_arr 0
-
-#undef cpu_has_centaur_mcr
-#define cpu_has_centaur_mcr 0
-
-#endif /* CONFIG_X86_64 */
-
#if __GNUC__ >= 4
extern void warn_pre_alternatives(void);
extern bool __static_cpu_has_safe(u16 bit);
diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h
new file mode 100644
index 000000000000..f498411f2500
--- /dev/null
+++ b/arch/x86/include/asm/crash.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_X86_CRASH_H
+#define _ASM_X86_CRASH_H
+
+int crash_load_segments(struct kimage *image);
+int crash_copy_backup_region(struct kimage *image);
+int crash_setup_memmap_entries(struct kimage *image,
+ struct boot_params *params);
+
+#endif /* _ASM_X86_CRASH_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 4b528a970bd4..61fd18b83b6c 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -97,11 +97,11 @@ extern void hw_breakpoint_restore(void);
DECLARE_PER_CPU(int, debug_stack_usage);
static inline void debug_stack_usage_inc(void)
{
- __get_cpu_var(debug_stack_usage)++;
+ __this_cpu_inc(debug_stack_usage);
}
static inline void debug_stack_usage_dec(void)
{
- __get_cpu_var(debug_stack_usage)--;
+ __this_cpu_dec(debug_stack_usage);
}
int is_debug_stack(unsigned long addr);
void debug_stack_set_zero(void);
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
new file mode 100644
index 000000000000..97534a7d38e3
--- /dev/null
+++ b/arch/x86/include/asm/disabled-features.h
@@ -0,0 +1,39 @@
+#ifndef _ASM_X86_DISABLED_FEATURES_H
+#define _ASM_X86_DISABLED_FEATURES_H
+
+/* These features, although they might be available in a CPU
+ * will not be used because the compile options to support
+ * them are not present.
+ *
+ * This code allows them to be checked and disabled at
+ * compile time without an explicit #ifdef. Use
+ * cpu_feature_enabled().
+ */
+
+#ifdef CONFIG_X86_64
+# define DISABLE_VME (1<<(X86_FEATURE_VME & 31))
+# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
+# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
+# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+#else
+# define DISABLE_VME 0
+# define DISABLE_K6_MTRR 0
+# define DISABLE_CYRIX_ARR 0
+# define DISABLE_CENTAUR_MCR 0
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Make sure to add features to the correct mask
+ */
+#define DISABLED_MASK0 (DISABLE_VME)
+#define DISABLED_MASK1 0
+#define DISABLED_MASK2 0
+#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
+#define DISABLED_MASK4 0
+#define DISABLED_MASK5 0
+#define DISABLED_MASK6 0
+#define DISABLED_MASK7 0
+#define DISABLED_MASK8 0
+#define DISABLED_MASK9 0
+
+#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/dma-contiguous.h b/arch/x86/include/asm/dma-contiguous.h
deleted file mode 100644
index b4b38bacb404..000000000000
--- a/arch/x86/include/asm/dma-contiguous.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef ASMX86_DMA_CONTIGUOUS_H
-#define ASMX86_DMA_CONTIGUOUS_H
-
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-
-static inline void
-dma_contiguous_early_fixup(phys_addr_t base, unsigned long size) { }
-
-#endif
-#endif
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 044a2fd3c5fe..9b11757975d0 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -81,24 +81,23 @@ extern u64 asmlinkage efi_call(void *fp, ...);
*/
#define __efi_call_virt(f, args...) efi_call_virt(f, args)
-extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
- u32 type, u64 attribute);
+extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
+ u32 type, u64 attribute);
#endif /* CONFIG_X86_32 */
-extern int add_efi_memmap;
extern struct efi_scratch efi_scratch;
-extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
-extern int efi_memblock_x86_reserve_range(void);
-extern void efi_call_phys_prelog(void);
-extern void efi_call_phys_epilog(void);
-extern void efi_unmap_memmap(void);
-extern void efi_memory_uc(u64 addr, unsigned long size);
+extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
+extern int __init efi_memblock_x86_reserve_range(void);
+extern void __init efi_call_phys_prolog(void);
+extern void __init efi_call_phys_epilog(void);
+extern void __init efi_unmap_memmap(void);
+extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
-extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
-extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
+extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init old_map_region(efi_memory_desc_t *md);
extern void __init runtime_code_page_mkexec(void);
extern void __init efi_runtime_mkexec(void);
@@ -159,43 +158,9 @@ static inline efi_status_t efi_thunk_set_virtual_address_map(
}
#endif /* CONFIG_EFI_MIXED */
-
-/* arch specific definitions used by the stub code */
-
-struct efi_config {
- u64 image_handle;
- u64 table;
- u64 allocate_pool;
- u64 allocate_pages;
- u64 get_memory_map;
- u64 free_pool;
- u64 free_pages;
- u64 locate_handle;
- u64 handle_protocol;
- u64 exit_boot_services;
- u64 text_output;
- efi_status_t (*call)(unsigned long, ...);
- bool is64;
-} __packed;
-
-extern struct efi_config *efi_early;
-
-#define efi_call_early(f, ...) \
- efi_early->call(efi_early->f, __VA_ARGS__);
-
extern bool efi_reboot_required(void);
#else
-/*
- * IF EFI is not configured, have the EFI calls return -ENOSYS.
- */
-#define efi_call0(_f) (-ENOSYS)
-#define efi_call1(_f, _a1) (-ENOSYS)
-#define efi_call2(_f, _a1, _a2) (-ENOSYS)
-#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
-#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
-#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
-#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {}
static inline bool efi_reboot_required(void)
{
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 1a055c81d864..ca3347a9dab5 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -160,8 +160,9 @@ do { \
#define elf_check_arch(x) \
((x)->e_machine == EM_X86_64)
-#define compat_elf_check_arch(x) \
- (elf_check_arch_ia32(x) || (x)->e_machine == EM_X86_64)
+#define compat_elf_check_arch(x) \
+ (elf_check_arch_ia32(x) || \
+ (IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
#if __USER32_DS != __USER_DS
# error "The following code assumes __USER32_DS == __USER_DS"
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index b0910f97a3ea..ffb1733ac91f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -106,14 +106,14 @@ enum fixed_addresses {
__end_of_permanent_fixed_addresses,
/*
- * 256 temporary boot-time mappings, used by early_ioremap(),
+ * 512 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
*
- * If necessary we round it up to the next 256 pages boundary so
+ * If necessary we round it up to the next 512 pages boundary so
* that we can have a single pgd entry and a single pte table:
*/
#define NR_FIX_BTMAPS 64
-#define FIX_BTMAPS_SLOTS 4
+#define FIX_BTMAPS_SLOTS 8
#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
FIX_BTMAP_END =
(__end_of_permanent_fixed_addresses ^
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e3b85422cf12..e97622f57722 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -344,7 +344,7 @@ static inline void __thread_fpu_end(struct task_struct *tsk)
static inline void __thread_fpu_begin(struct task_struct *tsk)
{
- if (!static_cpu_has_safe(X86_FEATURE_EAGER_FPU))
+ if (!use_eager_fpu())
clts();
__thread_set_has_fpu(tsk);
}
@@ -508,9 +508,12 @@ static inline void user_fpu_begin(void)
static inline void __save_fpu(struct task_struct *tsk)
{
- if (use_xsave())
- xsave_state(&tsk->thread.fpu.state->xsave, -1);
- else
+ if (use_xsave()) {
+ if (unlikely(system_state == SYSTEM_BOOTING))
+ xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
+ else
+ xsave_state(&tsk->thread.fpu.state->xsave, -1);
+ } else
fpu_fxsave(&tsk->thread.fpu);
}
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 230853da4ec0..0f5fb6b6567e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -40,9 +40,6 @@ typedef struct {
DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
-#define MAX_HARDIRQS_PER_CPU NR_VECTORS
-
#define __ARCH_IRQ_STAT
#define inc_irq_stat(member) this_cpu_inc(irq_stat.member)
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index a20365953bf8..ccffa53750a8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -67,4 +67,9 @@ struct legacy_pic {
extern struct legacy_pic *legacy_pic;
extern struct legacy_pic null_legacy_pic;
+static inline int nr_legacy_irqs(void)
+{
+ return legacy_pic->nr_legacy_irqs;
+}
+
#endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 90f97b4b9347..1733ab49ac5e 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -98,6 +98,8 @@ struct IR_IO_APIC_route_entry {
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
+#define IOAPIC_MAP_ALLOC 0x1
+#define IOAPIC_MAP_CHECK 0x2
#ifdef CONFIG_X86_IO_APIC
@@ -118,9 +120,6 @@ extern int mp_irq_entries;
/* MP IRQ source entries */
extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
-/* non-0 if default (table-less) MP configuration */
-extern int mpc_default_type;
-
/* Older SiS APIC requires we rewrite the index register */
extern int sis_apic_bug;
@@ -133,9 +132,6 @@ extern int noioapicquirk;
/* -1 if "noapic" boot option passed */
extern int noioapicreroute;
-/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
-extern int timer_through_8259;
-
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -145,24 +141,17 @@ extern int timer_through_8259;
struct io_apic_irq_attr;
struct irq_cfg;
-extern int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr);
-void setup_IO_APIC_irq_extra(u32 gsi);
extern void ioapic_insert_resources(void);
extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
unsigned int, int,
struct io_apic_irq_attr *);
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
- unsigned int, int,
- struct io_apic_irq_attr *);
extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
extern void native_compose_msi_msg(struct pci_dev *pdev,
unsigned int irq, unsigned int dest,
struct msi_msg *msg, u8 hpet_id);
extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
extern int save_ioapic_entries(void);
extern void mask_ioapic_entries(void);
@@ -171,15 +160,40 @@ extern int restore_ioapic_entries(void);
extern void setup_ioapic_ids_from_mpc(void);
extern void setup_ioapic_ids_from_mpc_nocheck(void);
+enum ioapic_domain_type {
+ IOAPIC_DOMAIN_INVALID,
+ IOAPIC_DOMAIN_LEGACY,
+ IOAPIC_DOMAIN_STRICT,
+ IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct device_node;
+struct irq_domain;
+struct irq_domain_ops;
+
+struct ioapic_domain_cfg {
+ enum ioapic_domain_type type;
+ const struct irq_domain_ops *ops;
+ struct device_node *dev;
+};
+
struct mp_ioapic_gsi{
u32 gsi_base;
u32 gsi_end;
};
-extern struct mp_ioapic_gsi mp_gsi_routing[];
extern u32 gsi_top;
-int mp_find_ioapic(u32 gsi);
-int mp_find_ioapic_pin(int ioapic, u32 gsi);
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
+
+extern int mp_find_ioapic(u32 gsi);
+extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
+extern u32 mp_pin_to_gsi(int ioapic, int pin);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern void mp_unmap_irq(int irq);
+extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg);
+extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq);
+extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
+extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
extern void __init pre_init_apic_IRQ0(void);
extern void mp_save_irq(struct mpc_intsrc *m);
@@ -213,18 +227,19 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
extern void io_apic_eoi(unsigned int apic, unsigned int vector);
+extern bool mp_should_keep_irq(struct device *dev);
+
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
-static const int timer_through_8259 = 0;
static inline void ioapic_insert_resources(void) { }
#define gsi_top (NR_IRQS_LEGACY)
static inline int mp_find_ioapic(u32 gsi) { return 0; }
-
-struct io_apic_irq_attr;
-static inline int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr) { return 0; }
+static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline void mp_unmap_irq(int irq) { }
+static inline bool mp_should_keep_irq(struct device *dev) { return 1; }
static inline int save_ioapic_entries(void)
{
diff --git a/arch/x86/include/asm/irq_work.h b/arch/x86/include/asm/irq_work.h
new file mode 100644
index 000000000000..78162f8e248b
--- /dev/null
+++ b/arch/x86/include/asm/irq_work.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_IRQ_WORK_H
+#define _ASM_IRQ_WORK_H
+
+#include <asm/processor.h>
+
+static inline bool arch_irq_work_has_interrupt(void)
+{
+ return cpu_has_apic;
+}
+
+#endif /* _ASM_IRQ_WORK_H */
diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h
new file mode 100644
index 000000000000..d1b5d194e31d
--- /dev/null
+++ b/arch/x86/include/asm/kexec-bzimage64.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_KEXEC_BZIMAGE64_H
+#define _ASM_KEXEC_BZIMAGE64_H
+
+extern struct kexec_file_ops kexec_bzImage64_ops;
+
+#endif /* _ASM_KEXEC_BZIMAGE64_H */
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 17483a492f18..d2434c1cad05 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -23,6 +23,9 @@
#include <asm/page.h>
#include <asm/ptrace.h>
+#include <asm/bootparam.h>
+
+struct kimage;
/*
* KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
@@ -61,6 +64,10 @@
# define KEXEC_ARCH KEXEC_ARCH_X86_64
#endif
+/* Memory to back up during crash kdump */
+#define KEXEC_BACKUP_SRC_START (0UL)
+#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */
+
/*
* CPU does not save ss and sp on stack if execution is already
* running in kernel mode at the time of NMI occurrence. This code
@@ -160,6 +167,44 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ /* Details of backup region */
+ unsigned long backup_src_start;
+ unsigned long backup_src_sz;
+
+ /* Physical address of backup segment */
+ unsigned long backup_load_addr;
+
+ /* Core ELF header buffer */
+ void *elf_headers;
+ unsigned long elf_headers_sz;
+ unsigned long elf_load_addr;
+};
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_X86_64
+/*
+ * Number of elements and order of elements in this structure should match
+ * with the ones in arch/x86/purgatory/entry64.S. If you make a change here
+ * make an appropriate change in purgatory too.
+ */
+struct kexec_entry64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+ uint64_t rip;
};
#endif
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 53cdfb2857ab..4421b5da409d 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -27,7 +27,6 @@
#include <asm/insn.h>
#define __ARCH_WANT_KPROBES_INSN_SLOT
-#define ARCH_SUPPORTS_KPROBES_ON_FTRACE
struct pt_regs;
struct kprobe;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 572460175ba5..6ed0c30d6a0c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -95,14 +95,10 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
-#define KVM_NR_VAR_MTRR 10
+#define KVM_NR_VAR_MTRR 8
#define ASYNC_PF_PER_VCPU 64
-struct kvm_vcpu;
-struct kvm;
-struct kvm_async_pf;
-
enum kvm_reg {
VCPU_REGS_RAX = 0,
VCPU_REGS_RCX = 1,
@@ -266,7 +262,8 @@ struct kvm_mmu {
struct x86_exception *fault);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
- gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+ gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -481,6 +478,7 @@ struct kvm_vcpu_arch {
u64 mmio_gva;
unsigned access;
gfn_t mmio_gfn;
+ u64 mmio_gen;
struct kvm_pmu pmu;
@@ -576,11 +574,10 @@ struct kvm_arch {
struct kvm_apic_map *apic_map;
unsigned int tss_addr;
- struct page *apic_access_page;
+ bool apic_access_page_done;
gpa_t wall_clock;
- struct page *ept_identity_pagetable;
bool ept_identity_pagetable_done;
gpa_t ept_identity_map_addr;
@@ -665,8 +662,8 @@ struct msr_data {
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
- int (*hardware_enable)(void *dummy);
- void (*hardware_disable)(void *dummy);
+ int (*hardware_enable)(void);
+ void (*hardware_disable)(void);
void (*check_processor_compatibility)(void *rtn);
int (*hardware_setup)(void); /* __init */
void (*hardware_unsetup)(void); /* __exit */
@@ -710,7 +707,6 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
- void (*fpu_activate)(struct kvm_vcpu *vcpu);
void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -740,6 +736,7 @@ struct kvm_x86_ops {
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+ void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
@@ -772,6 +769,8 @@ struct kvm_x86_ops {
bool (*mpx_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
+
+ void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
};
struct kvm_arch_async_pf {
@@ -895,7 +894,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
static inline int __kvm_irq_line_state(unsigned long *irq_state,
@@ -917,7 +915,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int fx_init(struct kvm_vcpu *vcpu);
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes);
int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
@@ -926,7 +923,8 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -946,7 +944,8 @@ void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
void kvm_enable_tdp(void);
void kvm_disable_tdp(void);
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
{
return gpa;
}
@@ -990,6 +989,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
+static inline u64 get_canonical(u64 la)
+{
+ return ((int64_t)la << 16) >> 16;
+}
+
+static inline bool is_noncanonical_address(u64 la)
+{
+#ifdef CONFIG_X86_64
+ return get_canonical(la) != la;
+#else
+ return false;
+#endif
+}
+
#define TSS_IOPB_BASE_OFFSET 0x66
#define TSS_BASE_SIZE 0x68
#define TSS_IOPB_SIZE (65536 / 8)
@@ -1037,7 +1050,7 @@ asmlinkage void kvm_spurious_fault(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
-int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
@@ -1046,9 +1059,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address);
void kvm_define_shared_msr(unsigned index, u32 msr);
-void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c7678e43465b..e62cf897f781 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
#define _ASM_X86_KVM_PARA_H
#include <asm/processor.h>
+#include <asm/alternative.h>
#include <uapi/asm/kvm_para.h>
extern void kvmclock_init(void);
@@ -16,10 +17,15 @@ static inline bool kvm_check_and_clear_guest_paused(void)
}
#endif /* CONFIG_KVM_GUEST */
-/* This instruction is vmcall. On non-VT architectures, it will generate a
- * trap that we will then rewrite to the appropriate instruction.
+#ifdef CONFIG_DEBUG_RODATA
+#define KVM_HYPERCALL \
+ ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL)
+#else
+/* On AMD processors, vmcall will generate a trap that we will
+ * then rewrite to the appropriate instruction.
*/
#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
+#endif
/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
* instruction. The hypervisor may replace it with something else but only the
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 9067166409bf..bbe296e0bce1 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -43,7 +43,7 @@ struct extended_sigtable {
#define DWSIZE (sizeof(u32))
#define get_totalsize(mc) \
- (((struct microcode_intel *)mc)->hdr.totalsize ? \
+ (((struct microcode_intel *)mc)->hdr.datasize ? \
((struct microcode_intel *)mc)->hdr.totalsize : \
DEFAULT_UCODE_TOTALSIZE)
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index f5a617956735..b07233b64578 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -40,8 +40,6 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
extern unsigned int boot_cpu_physical_apicid;
-extern unsigned int max_physical_apicid;
-extern int mpc_default_type;
extern unsigned long mp_lapic_addr;
#ifdef CONFIG_X86_LOCAL_APIC
@@ -88,15 +86,6 @@ static inline void early_reserve_e820_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
-#ifdef CONFIG_ACPI
-extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
-extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
- u32 gsi);
-extern void mp_config_acpi_legacy_irqs(void);
-struct device;
-extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
- int active_high_low);
-#endif /* CONFIG_ACPI */
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
@@ -161,8 +150,4 @@ static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
extern physid_mask_t phys_cpu_present_map;
-extern int generic_mps_oem_check(struct mpc_table *, char *, char *);
-
-extern int default_acpi_madt_oem_check(char *, char *);
-
#endif /* _ASM_X86_MPSPEC_H */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 4064acae625d..01b493e5a99b 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -9,7 +9,6 @@
#ifdef CONFIG_NUMA
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
-#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
/*
* Too small node sizes may confuse the VM badly. Usually they
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 775873d3be55..802dde30c928 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -70,7 +70,6 @@ extern bool __virt_addr_valid(unsigned long kaddr);
#include <asm-generic/memory_model.h>
#include <asm-generic/getorder.h>
-#define __HAVE_ARCH_GATE_AREA 1
#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index 0f1ddee6a0ce..f408caf73430 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -39,4 +39,6 @@ void copy_page(void *to, void *from);
#endif /* !__ASSEMBLY__ */
+#define __HAVE_ARCH_GATE_AREA 1
+
#endif /* _ASM_X86_PAGE_64_H */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 8249df45d2f2..8dfc9fd094a3 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -51,6 +51,14 @@
ARCH_PERFMON_EVENTSEL_EDGE | \
ARCH_PERFMON_EVENTSEL_INV | \
ARCH_PERFMON_EVENTSEL_CMASK)
+#define X86_ALL_EVENT_FLAGS \
+ (ARCH_PERFMON_EVENTSEL_EDGE | \
+ ARCH_PERFMON_EVENTSEL_INV | \
+ ARCH_PERFMON_EVENTSEL_CMASK | \
+ ARCH_PERFMON_EVENTSEL_ANY | \
+ ARCH_PERFMON_EVENTSEL_PIN_CONTROL | \
+ HSW_IN_TX | \
+ HSW_IN_TX_CHECKPOINTED)
#define AMD64_RAW_EVENT_MASK \
(X86_RAW_EVENT_MASK | \
AMD64_EVENTSEL_EVENT)
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 85e13ccf15c4..d725382c2ae0 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -189,7 +189,7 @@ static inline int p4_ht_thread(int cpu)
{
#ifdef CONFIG_SMP
if (smp_num_siblings == 2)
- return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+ return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
#endif
return 0;
}
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0ec056012618..aa97a070f09f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,8 +131,13 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
- (_PAGE_PRESENT|_PAGE_SPECIAL);
+ /*
+ * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
+ * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
+ * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
+ */
+ return (pte_flags(pte) & _PAGE_SPECIAL) &&
+ (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
}
static inline unsigned long pte_pfn(pte_t pte)
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 9ee322103c6d..b6c0b404898a 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -32,9 +32,6 @@ static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
void paging_init(void);
-extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
-
-
/*
* Define this if things work differently on an i386 and an i486:
* it will (on an i486) warn about kernel memory accesses that are
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 5be9063545d2..4572b2f30237 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -19,6 +19,7 @@ extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
+extern pte_t level1_fixmap_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
@@ -115,7 +116,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
-extern void sync_global_pgds(unsigned long start, unsigned long end);
+extern void sync_global_pgds(unsigned long start, unsigned long end,
+ int removed);
/*
* Conversion functions: convert a page and protection to a page entry,
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f216963760e5..07789647bf33 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,7 +23,6 @@
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
-#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
@@ -52,7 +51,7 @@
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
-#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@@ -168,10 +167,10 @@
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
+#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
+#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS)
+#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC)
#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
@@ -325,6 +324,20 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
+#ifdef CONFIG_NUMA_BALANCING
+/* Set of bits that distinguishes present, prot_none and numa ptes */
+#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
+static inline pteval_t ptenuma_flags(pte_t pte)
+{
+ return pte_flags(pte) & _PAGE_NUMA_MASK;
+}
+
+static inline pmdval_t pmdnuma_flags(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_NUMA_MASK;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 7024c12f7bfe..400873450e33 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -105,6 +105,7 @@ static __always_inline bool should_resched(void)
# ifdef CONFIG_CONTEXT_TRACKING
extern asmlinkage void ___preempt_schedule_context(void);
# define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
+ extern asmlinkage void preempt_schedule_context(void);
# endif
#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ee30b9f0b91c..eb71ec794732 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -385,8 +385,8 @@ struct bndcsr_struct {
struct xsave_hdr_struct {
u64 xstate_bv;
- u64 reserved1[2];
- u64 reserved2[5];
+ u64 xcomp_bv;
+ u64 reserved[6];
} __attribute__((packed));
struct xsave_struct {
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index fbeb06ed0eaa..1d081ac1cd69 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -26,12 +26,10 @@
extern int of_ioapic;
extern u64 initial_dtb;
extern void add_dtb(u64 data);
-extern void x86_add_irq_domains(void);
void x86_of_pci_init(void);
void x86_dtb_init(void);
#else
static inline void add_dtb(u64 data) { }
-static inline void x86_add_irq_domains(void) { }
static inline void x86_of_pci_init(void) { }
static inline void x86_dtb_init(void) { }
#define of_ioapic 0
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c434db..86fc2bb82287 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
int error_code, int si_code);
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+ unsigned long phase1_result);
+
extern long syscall_trace_enter(struct pt_regs *);
extern void syscall_trace_leave(struct pt_regs *);
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
deleted file mode 100644
index a5370a03d90c..000000000000
--- a/arch/x86/include/asm/rwlock.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#ifndef _ASM_X86_RWLOCK_H
-#define _ASM_X86_RWLOCK_H
-
-#include <asm/asm.h>
-
-#if CONFIG_NR_CPUS <= 2048
-
-#ifndef __ASSEMBLY__
-typedef union {
- s32 lock;
- s32 write;
-} arch_rwlock_t;
-#endif
-
-#define RW_LOCK_BIAS 0x00100000
-#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##l)
-#define READ_LOCK_ATOMIC(n) atomic_##n
-#define WRITE_LOCK_ADD(n) __ASM_FORM_COMMA(addl n)
-#define WRITE_LOCK_SUB(n) __ASM_FORM_COMMA(subl n)
-#define WRITE_LOCK_CMP RW_LOCK_BIAS
-
-#else /* CONFIG_NR_CPUS > 2048 */
-
-#include <linux/const.h>
-
-#ifndef __ASSEMBLY__
-typedef union {
- s64 lock;
- struct {
- u32 read;
- s32 write;
- };
-} arch_rwlock_t;
-#endif
-
-#define RW_LOCK_BIAS (_AC(1,L) << 32)
-#define READ_LOCK_SIZE(insn) __ASM_FORM(insn##q)
-#define READ_LOCK_ATOMIC(n) atomic64_##n
-#define WRITE_LOCK_ADD(n) __ASM_FORM(incl)
-#define WRITE_LOCK_SUB(n) __ASM_FORM(decl)
-#define WRITE_LOCK_CMP 1
-
-#endif /* CONFIG_NR_CPUS */
-
-#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
-
-/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
-
-#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
deleted file mode 100644
index 4240878b9d76..000000000000
--- a/arch/x86/include/asm/scatterlist.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _ASM_X86_SCATTERLIST_H
-#define _ASM_X86_SCATTERLIST_H
-
-#include <asm-generic/scatterlist.h>
-
-#define ARCH_HAS_SG_CHAIN
-
-#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 628c801535ea..460b84f64556 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -6,24 +6,24 @@
*
* It'd be nice if someone built a serial card with a 24.576 MHz
* clock, since the 16550A is capable of handling a top speed of 1.5
- * megabits/second; but this requires the faster clock.
+ * megabits/second; but this requires a faster clock.
*/
-#define BASE_BAUD ( 1843200 / 16 )
+#define BASE_BAUD (1843200/16)
/* Standard COM flags (except for COM4, because of the 8514 problem) */
#ifdef CONFIG_SERIAL_DETECT_IRQ
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
-#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
+# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
+# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ)
#else
-#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
-#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
+# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 )
+# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 )
#endif
-#define SERIAL_PORT_DFNS \
- /* UART CLK PORT IRQ FLAGS */ \
- { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
- { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
- { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
- { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
+#define SERIAL_PORT_DFNS \
+ /* UART CLK PORT IRQ FLAGS */ \
+ { .uart = 0, BASE_BAUD, 0x3F8, 4, STD_COMX_FLAGS }, /* ttyS0 */ \
+ { .uart = 0, BASE_BAUD, 0x2F8, 3, STD_COMX_FLAGS }, /* ttyS1 */ \
+ { .uart = 0, BASE_BAUD, 0x3E8, 4, STD_COMX_FLAGS }, /* ttyS2 */ \
+ { .uart = 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd27e08e23c..8cd1cc3bc835 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,6 +150,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
}
void cpu_disable_common(void);
+void cpu_die_common(unsigned int cpu);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 49adfd7bb4a4..0da7409f0bec 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -17,11 +17,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_unlock_irqrestore(&rtc_lock, flags);
local_flush_tlb();
pr_debug("1.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
- start_eip >> 4;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
pr_debug("2.\n");
- *((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_low)) =
- start_eip & 0xf;
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
pr_debug("3.\n");
}
@@ -42,7 +42,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
CMOS_WRITE(0, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
- *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 54f1c8068c02..9295016485c9 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -187,7 +187,6 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
cpu_relax();
}
-#ifndef CONFIG_QUEUE_RWLOCK
/*
* Read-write spinlocks, allowing multiple readers
* but only one writer.
@@ -198,91 +197,15 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
* irq-safe write-lock, but readers can get non-irqsafe
* read-locks.
*
- * On x86, we implement read-write locks as a 32-bit counter
- * with the high bit (sign) being the "contended" bit.
+ * On x86, we implement read-write locks using the generic qrwlock with
+ * x86 specific optimization.
*/
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int arch_read_can_lock(arch_rwlock_t *lock)
-{
- return lock->lock > 0;
-}
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int arch_write_can_lock(arch_rwlock_t *lock)
-{
- return lock->write == WRITE_LOCK_CMP;
-}
-
-static inline void arch_read_lock(arch_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
- "jns 1f\n"
- "call __read_lock_failed\n\t"
- "1:\n"
- ::LOCK_PTR_REG (rw) : "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
- "jz 1f\n"
- "call __write_lock_failed\n\t"
- "1:\n"
- ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
- : "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
- READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
-
- if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
- return 1;
- READ_LOCK_ATOMIC(inc)(count);
- return 0;
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
- atomic_t *count = (atomic_t *)&lock->write;
-
- if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
- return 1;
- atomic_add(WRITE_LOCK_CMP, count);
- return 0;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
- :"+m" (rw->lock) : : "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *rw)
-{
- asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
- : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
-}
-#else
#include <asm/qrwlock.h>
-#endif /* CONFIG_QUEUE_RWLOCK */
#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
-#undef READ_LOCK_SIZE
-#undef READ_LOCK_ATOMIC
-#undef WRITE_LOCK_ADD
-#undef WRITE_LOCK_SUB
-#undef WRITE_LOCK_CMP
-
#define arch_spin_relax(lock) cpu_relax()
#define arch_read_relax(lock) cpu_relax()
#define arch_write_relax(lock) cpu_relax()
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 73c4c007200f..5f9d7572d82b 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -34,10 +34,6 @@ typedef struct arch_spinlock {
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
-#ifdef CONFIG_QUEUE_RWLOCK
#include <asm-generic/qrwlock_types.h>
-#else
-#include <asm/rwlock.h>
-#endif
#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index c63e925fd6b7..a00ad8f2a657 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -164,7 +164,7 @@ struct uv_hub_info_s {
};
DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
-#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
+#define uv_hub_info this_cpu_ptr(&__uv_hub_info)
#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
/*
@@ -601,16 +601,16 @@ struct uv_hub_nmi_s {
struct uv_cpu_nmi_s {
struct uv_hub_nmi_s *hub;
- atomic_t state;
- atomic_t pinging;
+ int state;
+ int pinging;
int queries;
int pings;
};
-DECLARE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi);
-#define uv_cpu_nmi (__get_cpu_var(__uv_cpu_nmi))
+DECLARE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
+
#define uv_hub_nmi (uv_cpu_nmi.hub)
-#define uv_cpu_nmi_per(cpu) (per_cpu(__uv_cpu_nmi, cpu))
+#define uv_cpu_nmi_per(cpu) (per_cpu(uv_cpu_nmi, cpu))
#define uv_hub_nmi_per(cpu) (uv_cpu_nmi_per(cpu).hub)
/* uv_cpu_nmi_states */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index d949ef28c48b..7e7a79ada658 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -52,24 +52,170 @@ extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+#define xstate_fault ".section .fixup,\"ax\"\n" \
+ "3: movl $-1,%[err]\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [err] "=r" (err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
{
- int err;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XSAVES"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XSAVE"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
- asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
- : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ asm volatile("1:"XRSTORS"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile("1:"XRSTOR"\n\t"
+ "2:\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile(xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline int xsave_state(struct xsave_struct *fx, u64 mask)
+{
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+ int err = 0;
+
+ /*
+ * If xsaves is enabled, xsaves replaces xsaveopt because
+ * it supports compact format and supervisor states in addition to
+ * modified optimization in xsaveopt.
+ *
+ * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+ * because xsaveopt supports modified optimization which is not
+ * supported by xsave.
+ *
+ * If none of xsaves and xsaveopt is enabled, use xsave.
+ */
+ alternative_input_2(
+ "1:"XSAVE,
+ "1:"XSAVEOPT,
+ X86_FEATURE_XSAVEOPT,
+ "1:"XSAVES,
+ X86_FEATURE_XSAVES,
+ [fx] "D" (fx), "a" (lmask), "d" (hmask) :
+ "memory");
+ asm volatile("2:\n\t"
+ xstate_fault
+ : "0" (0)
: "memory");
return err;
}
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
+{
+ int err = 0;
+ u32 lmask = mask;
+ u32 hmask = mask >> 32;
+
+ /*
+ * Use xrstors to restore context if it is enabled. xrstors supports
+ * compacted format of xsave area which is not supported by xrstor.
+ */
+ alternative_input(
+ "1: " XRSTOR,
+ "1: " XRSTORS,
+ X86_FEATURE_XSAVES,
+ "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+
+ asm volatile("2:\n"
+ xstate_fault
+ : "0" (0)
+ : "memory");
+
+ return err;
+}
+
+/*
+ * Save xstate context for old process during context switch.
+ */
+static inline void fpu_xsave(struct fpu *fpu)
+{
+ xsave_state(&fpu->state->xsave, -1);
+}
+
+/*
+ * Restore xstate context for new process during context switch.
+ */
+static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+{
+ return xrstor_state(fx, -1);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
static inline int xsave_user(struct xsave_struct __user *buf)
{
int err;
@@ -83,69 +229,34 @@ static inline int xsave_user(struct xsave_struct __user *buf)
return -EFAULT;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
+ "1:"XSAVE"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (buf), "a" (-1), "d" (-1), "0" (0)
: "memory");
return err;
}
+/*
+ * Restore xstate from user space xsave area.
+ */
static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
{
- int err;
+ int err = 0;
struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
__asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
+ "1:"XRSTOR"\n"
"2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
+ xstate_fault
: "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
: "memory"); /* memory required? */
return err;
}
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
-
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
-{
- u32 lmask = mask;
- u32 hmask = mask >> 32;
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
+void setup_xstate_comp(void);
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- alternative_input(
- ".byte " REX_PREFIX "0x0f,0xae,0x27",
- ".byte " REX_PREFIX "0x0f,0xae,0x37",
- X86_FEATURE_XSAVEOPT,
- [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
- "memory");
-}
#endif
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index bbae02470701..d993e33f5236 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -21,11 +21,6 @@
* this size.
*/
-/*
- * Odd: 'make headers_check' complains about numa.h if I try
- * to collapse the next two #ifdef lines to a single line:
- * #if defined(__KERNEL__) && defined(CONFIG_EFI)
- */
#ifndef __KERNEL__
#define E820_X_MAX E820MAX
#endif
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index eac9e92fe181..e21331ce368f 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -149,6 +149,9 @@
#define MSR_CORE_C1_RES 0x00000660
+#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
+#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+
#define MSR_AMD64_MC0_MASK 0xc0010044
#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 0e79420376eb..990a2fe1588d 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
#define EXIT_REASON_WBINVD 54
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
@@ -114,6 +115,7 @@
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
+ { EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bde3993624f1..8f1e77440b2b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -39,8 +39,6 @@ obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
-obj-$(CONFIG_PREEMPT) += preempt.o
-
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
@@ -71,6 +69,7 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
+obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a531f6564ed0..a142e77693e1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,6 +31,7 @@
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/irq.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/bootmem.h>
#include <linux/ioport.h>
@@ -43,6 +44,7 @@
#include <asm/io.h>
#include <asm/mpspec.h>
#include <asm/smp.h>
+#include <asm/i8259.h>
#include "sleep.h" /* To include x86_acpi_suspend_lowlevel */
static int __initdata acpi_force = 0;
@@ -93,44 +95,7 @@ static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
-static unsigned int gsi_to_irq(unsigned int gsi)
-{
- unsigned int irq = gsi + NR_IRQS_LEGACY;
- unsigned int i;
-
- for (i = 0; i < NR_IRQS_LEGACY; i++) {
- if (isa_irq_to_gsi[i] == gsi) {
- return i;
- }
- }
-
- /* Provide an identity mapping of gsi == irq
- * except on truly weird platforms that have
- * non isa irqs in the first 16 gsis.
- */
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
-
- return irq;
-}
-
-static u32 irq_to_gsi(int irq)
-{
- unsigned int gsi;
-
- if (irq < NR_IRQS_LEGACY)
- gsi = isa_irq_to_gsi[irq];
- else if (irq < gsi_top)
- gsi = irq;
- else if (irq < (gsi_top + NR_IRQS_LEGACY))
- gsi = irq - gsi_top;
- else
- gsi = 0xffffffff;
-
- return gsi;
-}
+#define ACPI_INVALID_GSI INT_MIN
/*
* This is just a simple wrapper around early_ioremap(),
@@ -341,11 +306,145 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#endif /*CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+#define MP_ISA_BUS 0
+
+static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
+ u32 gsi)
+{
+ int ioapic;
+ int pin;
+ struct mpc_intsrc mp_irq;
+
+ /*
+ * Convert 'gsi' to 'ioapic.pin'.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ /*
+ * TBD: This check is for faulty timer entries, where the override
+ * erroneously sets the trigger to level, resulting in a HUGE
+ * increase of timer interrupts!
+ */
+ if ((bus_irq == 0) && (trigger == 3))
+ trigger = 1;
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq; /* IRQ */
+ mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
+ mp_irq.dstirq = pin; /* INTIN# */
+
+ mp_save_irq(&mp_irq);
+
+ /*
+ * Reset default identity mapping if gsi is also an legacy IRQ,
+ * otherwise there will be more than one entry with the same GSI
+ * and acpi_isa_irq_to_gsi() may give wrong result.
+ */
+ if (gsi < nr_legacy_irqs() && isa_irq_to_gsi[gsi] == gsi)
+ isa_irq_to_gsi[gsi] = ACPI_INVALID_GSI;
+ isa_irq_to_gsi[bus_irq] = gsi;
+}
+
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+ struct mpc_intsrc mp_irq;
+ struct pci_dev *pdev;
+ unsigned char number;
+ unsigned int devfn;
+ int ioapic;
+ u8 pin;
+
+ if (!acpi_ioapic)
+ return 0;
+ if (!dev || !dev_is_pci(dev))
+ return 0;
+
+ pdev = to_pci_dev(dev);
+ number = pdev->bus->number;
+ devfn = pdev->devfn;
+ pin = pdev->pin;
+ /* print the entry should happen on mptable identically */
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+ (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+ mp_irq.srcbus = number;
+ mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+ ioapic = mp_find_ioapic(gsi);
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_save_irq(&mp_irq);
+#endif
+ return 0;
+}
+
+static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
+ int polarity)
+{
+ int irq, node;
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
+
+ trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+ polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+ node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+ if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
+ pr_warn("Failed to set pin attr for GSI%d\n", gsi);
+ return -1;
+ }
+
+ irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
+ if (irq < 0)
+ return irq;
+
+ if (enable_update_mptable)
+ mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+ return irq;
+}
+
+static void mp_unregister_gsi(u32 gsi)
+{
+ int irq;
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return;
+
+ if (acpi_gbl_FADT.sci_interrupt == gsi)
+ return;
+
+ irq = mp_map_gsi_to_irq(gsi, 0);
+ if (irq > 0)
+ mp_unmap_irq(irq);
+}
+
+static struct irq_domain_ops acpi_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
+};
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
struct acpi_madt_io_apic *ioapic = NULL;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &acpi_irqdomain_ops,
+ };
ioapic = (struct acpi_madt_io_apic *)header;
@@ -354,8 +453,12 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
acpi_table_print_madt_entry(header);
- mp_register_ioapic(ioapic->id,
- ioapic->address, ioapic->global_irq_base);
+ /* Statically assign IRQ numbers for IOAPICs hosting legacy IRQs */
+ if (ioapic->global_irq_base < nr_legacy_irqs())
+ cfg.type = IOAPIC_DOMAIN_LEGACY;
+
+ mp_register_ioapic(ioapic->id, ioapic->address, ioapic->global_irq_base,
+ &cfg);
return 0;
}
@@ -378,11 +481,6 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
- /*
- * mp_config_acpi_legacy_irqs() already setup IRQs < 16
- * If GSI is < 16, this will update its flags,
- * else it will create a new mp_irqs[] entry.
- */
mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
/*
@@ -504,25 +602,32 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
outb(new >> 8, 0x4d1);
}
-int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- *irq = gsi_to_irq(gsi);
-
-#ifdef CONFIG_X86_IO_APIC
- if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
- setup_IO_APIC_irq_extra(gsi);
-#endif
+ int irq;
+ if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
+ *irqp = gsi;
+ } else {
+ irq = mp_map_gsi_to_irq(gsi,
+ IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
+ if (irq < 0)
+ return -1;
+ *irqp = irq;
+ }
return 0;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
- if (isa_irq >= 16)
- return -1;
- *gsi = irq_to_gsi(isa_irq);
- return 0;
+ if (isa_irq < nr_legacy_irqs() &&
+ isa_irq_to_gsi[isa_irq] != ACPI_INVALID_GSI) {
+ *gsi = isa_irq_to_gsi[isa_irq];
+ return 0;
+ }
+
+ return -1;
}
static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
@@ -542,15 +647,25 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int trigger, int polarity)
{
+ int irq = gsi;
+
#ifdef CONFIG_X86_IO_APIC
- gsi = mp_register_gsi(dev, gsi, trigger, polarity);
+ irq = mp_register_gsi(dev, gsi, trigger, polarity);
#endif
- return gsi;
+ return irq;
+}
+
+static void acpi_unregister_gsi_ioapic(u32 gsi)
+{
+#ifdef CONFIG_X86_IO_APIC
+ mp_unregister_gsi(gsi);
+#endif
}
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
+void (*__acpi_unregister_gsi)(u32 gsi) = NULL;
#ifdef CONFIG_ACPI_SLEEP
int (*acpi_suspend_lowlevel)(void) = x86_acpi_suspend_lowlevel;
@@ -564,32 +679,22 @@ int (*acpi_suspend_lowlevel)(void);
*/
int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
{
- unsigned int irq;
- unsigned int plat_gsi = gsi;
-
- plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
- irq = gsi_to_irq(plat_gsi);
-
- return irq;
+ return __acpi_register_gsi(dev, gsi, trigger, polarity);
}
EXPORT_SYMBOL_GPL(acpi_register_gsi);
void acpi_unregister_gsi(u32 gsi)
{
+ if (__acpi_unregister_gsi)
+ __acpi_unregister_gsi(gsi);
}
EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
-void __init acpi_set_irq_model_pic(void)
-{
- acpi_irq_model = ACPI_IRQ_MODEL_PIC;
- __acpi_register_gsi = acpi_register_gsi_pic;
- acpi_ioapic = 0;
-}
-
-void __init acpi_set_irq_model_ioapic(void)
+static void __init acpi_set_irq_model_ioapic(void)
{
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
__acpi_register_gsi = acpi_register_gsi_ioapic;
+ __acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
acpi_ioapic = 1;
}
@@ -825,9 +930,8 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
* and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing LAPIC address override entry\n");
@@ -852,9 +956,8 @@ static int __init acpi_parse_madt_lapic_entries(void)
* and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
- acpi_parse_lapic_addr_ovr, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
+ acpi_parse_lapic_addr_ovr, 0);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing LAPIC address override entry\n");
@@ -882,11 +985,10 @@ static int __init acpi_parse_madt_lapic_entries(void)
return count;
}
- x2count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
- acpi_parse_x2apic_nmi, 0);
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI, acpi_parse_lapic_nmi, 0);
+ x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC_NMI,
+ acpi_parse_x2apic_nmi, 0);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_NMI,
+ acpi_parse_lapic_nmi, 0);
if (count < 0 || x2count < 0) {
printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
@@ -897,44 +999,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
-#define MP_ISA_BUS 0
-
-void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- int ioapic;
- int pin;
- struct mpc_intsrc mp_irq;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = mp_find_ioapic_pin(ioapic, gsi);
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger << 2) | polarity;
- mp_irq.srcbus = MP_ISA_BUS;
- mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
- mp_irq.dstirq = pin; /* INTIN# */
-
- mp_save_irq(&mp_irq);
-
- isa_irq_to_gsi[bus_irq] = gsi;
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
+static void __init mp_config_acpi_legacy_irqs(void)
{
int i;
struct mpc_intsrc mp_irq;
@@ -952,7 +1017,7 @@ void __init mp_config_acpi_legacy_irqs(void)
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
- for (i = 0; i < 16; i++) {
+ for (i = 0; i < nr_legacy_irqs(); i++) {
int ioapic, pin;
unsigned int dstapic;
int idx;
@@ -1000,84 +1065,6 @@ void __init mp_config_acpi_legacy_irqs(void)
}
}
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
- int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
- struct mpc_intsrc mp_irq;
- struct pci_dev *pdev;
- unsigned char number;
- unsigned int devfn;
- int ioapic;
- u8 pin;
-
- if (!acpi_ioapic)
- return 0;
- if (!dev || !dev_is_pci(dev))
- return 0;
-
- pdev = to_pci_dev(dev);
- number = pdev->bus->number;
- devfn = pdev->devfn;
- pin = pdev->pin;
- /* print the entry should happen on mptable identically */
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
- (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
- mp_irq.srcbus = number;
- mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
- ioapic = mp_find_ioapic(gsi);
- mp_irq.dstapic = mpc_ioapic_id(ioapic);
- mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
- mp_save_irq(&mp_irq);
-#endif
- return 0;
-}
-
-int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
-{
- int ioapic;
- int ioapic_pin;
- struct io_apic_irq_attr irq_attr;
- int ret;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-
- if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mpc_ioapic_id(ioapic),
- ioapic_pin);
- return gsi;
- }
-
- if (enable_update_mptable)
- mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
- set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
- trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- ret = io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
- if (ret < 0)
- gsi = INT_MIN;
-
- return gsi;
-}
-
/*
* Parse IOAPIC related entries in MADT
* returns 0 on success, < 0 on error
@@ -1107,9 +1094,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
return -ENODEV;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
- MAX_IO_APICS);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_IO_APIC, acpi_parse_ioapic,
+ MAX_IO_APICS);
if (!count) {
printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
return -ENODEV;
@@ -1118,9 +1104,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
return count;
}
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE,
+ acpi_parse_int_src_ovr, nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX
"Error parsing interrupt source overrides entry\n");
@@ -1139,9 +1124,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
/* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
- count =
- acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
- nr_irqs);
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE,
+ acpi_parse_nmi_src, nr_irqs);
if (count < 0) {
printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
/* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index af5b08ab3b71..b708738d016e 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -146,7 +146,7 @@ static inline int is_apbt_capable(void)
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
- struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
+ struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
if (mtmr == NULL) {
@@ -185,8 +185,6 @@ static void apbt_setup_irq(struct apbt_dev *adev)
irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge type */
- __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
}
/* Should be called with per cpu */
@@ -200,7 +198,7 @@ void apbt_setup_secondary_clock(void)
if (!cpu)
return;
- adev = &__get_cpu_var(cpu_apbt_dev);
+ adev = this_cpu_ptr(&cpu_apbt_dev);
if (!adev->timer) {
adev->timer = dw_apb_clockevent_init(cpu, adev->name,
APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad28db7e6bde..ba6cc041edb1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -67,7 +67,7 @@ EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
/*
* The highest APIC ID seen during enumeration.
*/
-unsigned int max_physical_apicid;
+static unsigned int max_physical_apicid;
/*
* Bitmask of physically existing CPUs:
@@ -561,7 +561,7 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
*/
static void setup_APIC_timer(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
@@ -696,7 +696,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
static int __init calibrate_APIC_clock(void)
{
- struct clock_event_device *levt = &__get_cpu_var(lapic_events);
+ struct clock_event_device *levt = this_cpu_ptr(&lapic_events);
void (*real_handler)(struct clock_event_device *dev);
unsigned long deltaj;
long delta, deltatsc;
@@ -1297,7 +1297,7 @@ void setup_local_APIC(void)
unsigned int value, queued;
int i, j, acked = 0;
unsigned long long tsc = 0, ntsc;
- long long max_loops = cpu_khz;
+ long long max_loops = cpu_khz ? cpu_khz : 1000000;
if (cpu_has_tsc)
rdtscll(tsc);
@@ -1342,17 +1342,6 @@ void setup_local_APIC(void)
/* always use the value from LDR */
early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
logical_smp_processor_id();
-
- /*
- * Some NUMA implementations (NUMAQ) don't initialize apicid to
- * node mapping during NUMA init. Now that logical apicid is
- * guaranteed to be known, give it another chance. This is already
- * a bit too late - percpu allocation has already happened without
- * proper NUMA affinity.
- */
- if (apic->x86_32_numa_cpu_node)
- set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
- apic->x86_32_numa_cpu_node(cpu));
#endif
/*
@@ -1394,7 +1383,7 @@ void setup_local_APIC(void)
break;
}
if (queued) {
- if (cpu_has_tsc) {
+ if (cpu_has_tsc && cpu_khz) {
rdtscll(ntsc);
max_loops = (cpu_khz << 10) - (ntsc - tsc);
} else
@@ -2053,8 +2042,6 @@ void __init connect_bsp_APIC(void)
imcr_pic_to_apic();
}
#endif
- if (apic->enable_apic_mode)
- apic->enable_apic_mode();
}
/**
@@ -2451,51 +2438,6 @@ static void apic_pm_activate(void) { }
#ifdef CONFIG_X86_64
-static int apic_cluster_num(void)
-{
- int i, clusters, zeros;
- unsigned id;
- u16 *bios_cpu_apicid;
- DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
- bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
- bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
- for (i = 0; i < nr_cpu_ids; i++) {
- /* are we being called early in kernel startup? */
- if (bios_cpu_apicid) {
- id = bios_cpu_apicid[i];
- } else if (i < nr_cpu_ids) {
- if (cpu_present(i))
- id = per_cpu(x86_bios_cpu_apicid, i);
- else
- continue;
- } else
- break;
-
- if (id != BAD_APICID)
- __set_bit(APIC_CLUSTERID(id), clustermap);
- }
-
- /* Problem: Partially populated chassis may not have CPUs in some of
- * the APIC clusters they have been allocated. Only present CPUs have
- * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
- * Since clusters are allocated sequentially, count zeros only if
- * they are bounded by ones.
- */
- clusters = 0;
- zeros = 0;
- for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
- if (test_bit(i, clustermap)) {
- clusters += 1 + zeros;
- zeros = 0;
- } else
- ++zeros;
- }
-
- return clusters;
-}
-
static int multi_checked;
static int multi;
@@ -2540,20 +2482,7 @@ static void dmi_check_multi(void)
int apic_is_clustered_box(void)
{
dmi_check_multi();
- if (multi)
- return 1;
-
- if (!is_vsmp_box())
- return 0;
-
- /*
- * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
- * not guaranteed to be synced between boards
- */
- if (apic_cluster_num() > 1)
- return 1;
-
- return 0;
+ return multi;
}
#endif
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7c1b29479513..de918c410eae 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -168,21 +168,16 @@ static struct apic apic_flat = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -196,10 +191,7 @@ static struct apic apic_flat = {
.send_IPI_all = flat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
@@ -283,7 +275,6 @@ static struct apic apic_physflat = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
/* not needed, but shouldn't hurt: */
@@ -291,14 +282,10 @@ static struct apic apic_physflat = {
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = flat_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = flat_get_apic_id,
.set_apic_id = set_apic_id,
@@ -312,10 +299,7 @@ static struct apic apic_physflat = {
.send_IPI_all = physflat_send_IPI_all,
.send_IPI_self = apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 8c7c98249c20..b205cdbdbe6a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -89,16 +89,6 @@ static const struct cpumask *noop_target_cpus(void)
return cpumask_of(0);
}
-static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
-{
- return physid_isset(apicid, *map);
-}
-
-static unsigned long noop_check_apicid_present(int bit)
-{
- return physid_isset(bit, phys_cpu_present_map);
-}
-
static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask,
const struct cpumask *mask)
{
@@ -133,27 +123,21 @@ struct apic apic_noop = {
.target_cpus = noop_target_cpus,
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = noop_check_apicid_used,
- .check_apicid_present = noop_check_apicid_present,
+ .check_apicid_used = default_check_apicid_used,
.vector_allocation_domain = noop_vector_allocation_domain,
.init_apic_ldr = noop_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = noop_phys_pkg_id,
- .mps_oem_check = NULL,
-
.get_apic_id = noop_get_apic_id,
.set_apic_id = NULL,
.apic_id_mask = 0x0F << 24,
@@ -168,12 +152,7 @@ struct apic apic_noop = {
.wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
- /* should be safe */
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = noop_apic_read,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index a5b45df8bc88..4128b5fcb559 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -32,7 +32,7 @@
static int numachip_system __read_mostly;
-static const struct apic apic_numachip __read_mostly;
+static const struct apic apic_numachip;
static unsigned int get_apic_id(unsigned long x)
{
@@ -217,21 +217,16 @@ static const struct apic apic_numachip __refconst = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = flat_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = numachip_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
@@ -246,10 +241,7 @@ static const struct apic apic_numachip __refconst = {
.send_IPI_self = numachip_send_IPI_self,
.wakeup_secondary_cpu = numachip_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL, /* REMRD not supported */
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index e4840aa7a255..c4a8d63f8220 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -31,11 +31,6 @@ static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
return 0;
}
-static unsigned long bigsmp_check_apicid_present(int bit)
-{
- return 1;
-}
-
static int bigsmp_early_logical_apicid(int cpu)
{
/* on bigsmp, logical apicid is the same as physical */
@@ -168,21 +163,16 @@ static struct apic apic_bigsmp = {
.disable_esr = 1,
.dest_logical = 0,
.check_apicid_used = bigsmp_check_apicid_used,
- .check_apicid_present = bigsmp_check_apicid_present,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = bigsmp_init_apic_ldr,
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.setup_apic_routing = bigsmp_setup_apic_routing,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = bigsmp_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = bigsmp_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = bigsmp_get_apic_id,
.set_apic_id = NULL,
@@ -196,11 +186,7 @@ static struct apic apic_bigsmp = {
.send_IPI_all = bigsmp_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = true,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81e08eff05ee..1183d545da1e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -31,6 +31,7 @@
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/syscore_ops.h>
+#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/htirq.h>
#include <linux/freezer.h>
@@ -62,6 +63,16 @@
#define __apicdebuginit(type) static type __init
+#define for_each_ioapic(idx) \
+ for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
+#define for_each_ioapic_reverse(idx) \
+ for ((idx) = nr_ioapics - 1; (idx) >= 0; (idx)--)
+#define for_each_pin(idx, pin) \
+ for ((pin) = 0; (pin) < ioapics[(idx)].nr_registers; (pin)++)
+#define for_each_ioapic_pin(idx, pin) \
+ for_each_ioapic((idx)) \
+ for_each_pin((idx), (pin))
+
#define for_each_irq_pin(entry, head) \
for (entry = head; entry; entry = entry->next)
@@ -73,6 +84,17 @@ int sis_apic_bug = -1;
static DEFINE_RAW_SPINLOCK(ioapic_lock);
static DEFINE_RAW_SPINLOCK(vector_lock);
+static DEFINE_MUTEX(ioapic_mutex);
+static unsigned int ioapic_dynirq_base;
+static int ioapic_initialized;
+
+struct mp_pin_info {
+ int trigger;
+ int polarity;
+ int node;
+ int set;
+ u32 count;
+};
static struct ioapic {
/*
@@ -87,7 +109,9 @@ static struct ioapic {
struct mpc_ioapic mp_config;
/* IO APIC gsi routing info */
struct mp_ioapic_gsi gsi_config;
- DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+ struct ioapic_domain_cfg irqdomain_cfg;
+ struct irq_domain *irqdomain;
+ struct mp_pin_info *pin_info;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -107,6 +131,41 @@ struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
return &ioapics[ioapic_idx].gsi_config;
}
+static inline int mp_ioapic_pin_count(int ioapic)
+{
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+ return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
+}
+
+u32 mp_pin_to_gsi(int ioapic, int pin)
+{
+ return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
+}
+
+/*
+ * Initialize all legacy IRQs and all pins on the first IOAPIC
+ * if we have legacy interrupt controller. Kernel boot option "pirq="
+ * may rely on non-legacy pins on the first IOAPIC.
+ */
+static inline int mp_init_irq_at_boot(int ioapic, int irq)
+{
+ if (!nr_legacy_irqs())
+ return 0;
+
+ return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
+}
+
+static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
+{
+ return ioapics[ioapic_idx].pin_info + pin;
+}
+
+static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
+{
+ return ioapics[ioapic].irqdomain;
+}
+
int nr_ioapics;
/* The one past the highest gsi number used */
@@ -118,9 +177,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
-/* GSI interrupts */
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
-
#ifdef CONFIG_EISA
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
@@ -149,8 +205,7 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-static int io_apic_setup_irq_pin(unsigned int irq, int node,
- struct io_apic_irq_attr *attr);
+static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
@@ -182,19 +237,15 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
-
int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
- int count, node, i;
+ int i, node = cpu_to_node(0);
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
io_apic_irqs = ~0UL;
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
ioapics[i].saved_registers =
kzalloc(sizeof(struct IO_APIC_route_entry) *
ioapics[i].nr_registers, GFP_KERNEL);
@@ -202,28 +253,20 @@ int __init arch_early_irq_init(void)
pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
}
- cfg = irq_cfgx;
- count = ARRAY_SIZE(irq_cfgx);
- node = cpu_to_node(0);
-
- for (i = 0; i < count; i++) {
- irq_set_chip_data(i, &cfg[i]);
- zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
- zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
- /*
- * For legacy IRQ's, start with assigning irq0 to irq15 to
- * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
- */
- if (i < legacy_pic->nr_legacy_irqs) {
- cfg[i].vector = IRQ0_VECTOR + i;
- cpumask_setall(cfg[i].domain);
- }
+ /*
+ * For legacy IRQ's, start with assigning irq0 to irq15 to
+ * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
+ */
+ for (i = 0; i < nr_legacy_irqs(); i++) {
+ cfg = alloc_irq_and_cfg_at(i, node);
+ cfg->vector = IRQ0_VECTOR + i;
+ cpumask_setall(cfg->domain);
}
return 0;
}
-static struct irq_cfg *irq_cfg(unsigned int irq)
+static inline struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq_get_chip_data(irq);
}
@@ -265,7 +308,7 @@ static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
if (res < 0) {
if (res != -EEXIST)
return NULL;
- cfg = irq_get_chip_data(at);
+ cfg = irq_cfg(at);
if (cfg)
return cfg;
}
@@ -425,6 +468,21 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
return 0;
}
+static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+{
+ struct irq_pin_list **last, *entry;
+
+ last = &cfg->irq_2_pin;
+ for_each_irq_pin(entry, cfg->irq_2_pin)
+ if (entry->apic == apic && entry->pin == pin) {
+ *last = entry->next;
+ kfree(entry);
+ return;
+ } else {
+ last = &entry->next;
+ }
+}
+
static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
if (__add_pin_to_irq_node(cfg, node, apic, pin))
@@ -627,9 +685,8 @@ static void clear_IO_APIC (void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++)
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
- clear_IO_APIC_pin(apic, pin);
+ for_each_ioapic_pin(apic, pin)
+ clear_IO_APIC_pin(apic, pin);
}
#ifdef CONFIG_X86_32
@@ -678,13 +735,13 @@ int save_ioapic_entries(void)
int apic, pin;
int err = 0;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers) {
err = -ENOMEM;
continue;
}
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ for_each_pin(apic, pin)
ioapics[apic].saved_registers[pin] =
ioapic_read_entry(apic, pin);
}
@@ -699,11 +756,11 @@ void mask_ioapic_entries(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers)
continue;
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
+ for_each_pin(apic, pin) {
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
@@ -722,11 +779,11 @@ int restore_ioapic_entries(void)
{
int apic, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
+ for_each_ioapic(apic) {
if (!ioapics[apic].saved_registers)
continue;
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+ for_each_pin(apic, pin)
ioapic_write_entry(apic, pin,
ioapics[apic].saved_registers[pin]);
}
@@ -785,7 +842,7 @@ static int __init find_isa_irq_apic(int irq, int type)
if (i < mp_irq_entries) {
int ioapic_idx;
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic)
return ioapic_idx;
}
@@ -799,7 +856,7 @@ static int __init find_isa_irq_apic(int irq, int type)
*/
static int EISA_ELCR(unsigned int irq)
{
- if (irq < legacy_pic->nr_legacy_irqs) {
+ if (irq < nr_legacy_irqs()) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
@@ -939,29 +996,106 @@ static int irq_trigger(int idx)
return trigger;
}
-static int pin_2_irq(int idx, int apic, int pin)
+static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+{
+ int irq = -1;
+ int ioapic = (int)(long)domain->host_data;
+ int type = ioapics[ioapic].irqdomain_cfg.type;
+
+ switch (type) {
+ case IOAPIC_DOMAIN_LEGACY:
+ /*
+ * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
+ * GSIs on some weird platforms.
+ */
+ if (gsi < nr_legacy_irqs())
+ irq = irq_create_mapping(domain, pin);
+ else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_STRICT:
+ if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+ irq = gsi;
+ break;
+ case IOAPIC_DOMAIN_DYNAMIC:
+ irq = irq_create_mapping(domain, pin);
+ break;
+ default:
+ WARN(1, "ioapic: unknown irqdomain type %d\n", type);
+ break;
+ }
+
+ return irq > 0 ? irq : -1;
+}
+
+static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
+ unsigned int flags)
{
int irq;
- int bus = mp_irqs[idx].srcbus;
- struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
+ struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+ struct mp_pin_info *info = mp_pin_info(ioapic, pin);
+
+ if (!domain)
+ return -1;
+
+ mutex_lock(&ioapic_mutex);
/*
- * Debugging check, we are in big trouble if this message pops up!
+ * Don't use irqdomain to manage ISA IRQs because there may be
+ * multiple IOAPIC pins sharing the same ISA IRQ number and
+ * irqdomain only supports 1:1 mapping between IOAPIC pin and
+ * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
+ * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
+ * available, and some BIOSes may use MP Interrupt Source records
+ * to override IRQ numbers for PIRQs instead of reprogramming
+ * the interrupt routing logic. Thus there may be multiple pins
+ * sharing the same legacy IRQ number when ACPI is disabled.
*/
- if (mp_irqs[idx].dstirq != pin)
- pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
-
- if (test_bit(bus, mp_bus_not_pci)) {
+ if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
+ if (flags & IOAPIC_MAP_ALLOC) {
+ if (info->count == 0 &&
+ mp_irqdomain_map(domain, irq, pin) != 0)
+ irq = -1;
+
+ /* special handling for timer IRQ0 */
+ if (irq == 0)
+ info->count++;
+ }
} else {
- u32 gsi = gsi_cfg->gsi_base + pin;
+ irq = irq_find_mapping(domain, pin);
+ if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
+ irq = alloc_irq_from_domain(domain, gsi, pin);
+ }
- if (gsi >= NR_IRQS_LEGACY)
- irq = gsi;
- else
- irq = gsi_top + gsi;
+ if (flags & IOAPIC_MAP_ALLOC) {
+ /* special handling for legacy IRQs */
+ if (irq < nr_legacy_irqs() && info->count == 1 &&
+ mp_irqdomain_map(domain, irq, pin) != 0)
+ irq = -1;
+
+ if (irq > 0)
+ info->count++;
+ else if (info->count == 0)
+ info->set = 0;
}
+ mutex_unlock(&ioapic_mutex);
+
+ return irq > 0 ? irq : -1;
+}
+
+static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
+{
+ u32 gsi = mp_pin_to_gsi(ioapic, pin);
+
+ /*
+ * Debugging check, we are in big trouble if this message pops up!
+ */
+ if (mp_irqs[idx].dstirq != pin)
+ pr_err("broken BIOS or MPTABLE parser, ayiee!!\n");
+
#ifdef CONFIG_X86_32
/*
* PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -972,16 +1106,58 @@ static int pin_2_irq(int idx, int apic, int pin)
apic_printk(APIC_VERBOSE, KERN_DEBUG
"disabling PIRQ%d\n", pin-16);
} else {
- irq = pirq_entries[pin-16];
+ int irq = pirq_entries[pin-16];
apic_printk(APIC_VERBOSE, KERN_DEBUG
"using PIRQ%d -> IRQ %d\n",
pin-16, irq);
+ return irq;
}
}
}
#endif
- return irq;
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+{
+ int ioapic, pin, idx;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
+ return -1;
+
+ return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+}
+
+void mp_unmap_irq(int irq)
+{
+ struct irq_data *data = irq_get_irq_data(irq);
+ struct mp_pin_info *info;
+ int ioapic, pin;
+
+ if (!data || !data->domain)
+ return;
+
+ ioapic = (int)(long)data->domain->host_data;
+ pin = (int)data->hwirq;
+ info = mp_pin_info(ioapic, pin);
+
+ mutex_lock(&ioapic_mutex);
+ if (--info->count == 0) {
+ info->set = 0;
+ if (irq < nr_legacy_irqs() &&
+ ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
+ mp_irqdomain_unmap(data->domain, irq);
+ else
+ irq_dispose_mapping(irq);
+ }
+ mutex_unlock(&ioapic_mutex);
}
/*
@@ -991,7 +1167,7 @@ static int pin_2_irq(int idx, int apic, int pin)
int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
struct io_apic_irq_attr *irq_attr)
{
- int ioapic_idx, i, best_guess = -1;
+ int irq, i, best_ioapic = -1, best_idx = -1;
apic_printk(APIC_DEBUG,
"querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1001,44 +1177,56 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
"PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
+
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].srcbus;
+ int ioapic_idx, found = 0;
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ if (bus != lbus || mp_irqs[i].irqtype != mp_INT ||
+ slot != ((mp_irqs[i].srcbusirq >> 2) & 0x1f))
+ continue;
+
+ for_each_ioapic(ioapic_idx)
if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic ||
- mp_irqs[i].dstapic == MP_APIC_ALL)
+ mp_irqs[i].dstapic == MP_APIC_ALL) {
+ found = 1;
break;
+ }
+ if (!found)
+ continue;
- if (!test_bit(lbus, mp_bus_not_pci) &&
- !mp_irqs[i].irqtype &&
- (bus == lbus) &&
- (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
- int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq);
+ /* Skip ISA IRQs */
+ irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq, 0);
+ if (irq > 0 && !IO_APIC_IRQ(irq))
+ continue;
- if (!(ioapic_idx || IO_APIC_IRQ(irq)))
- continue;
+ if (pin == (mp_irqs[i].srcbusirq & 3)) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
+ goto out;
+ }
- if (pin == (mp_irqs[i].srcbusirq & 3)) {
- set_io_apic_irq_attr(irq_attr, ioapic_idx,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- return irq;
- }
- /*
- * Use the first all-but-pin matching entry as a
- * best-guess fuzzy result for broken mptables.
- */
- if (best_guess < 0) {
- set_io_apic_irq_attr(irq_attr, ioapic_idx,
- mp_irqs[i].dstirq,
- irq_trigger(i),
- irq_polarity(i));
- best_guess = irq;
- }
+ /*
+ * Use the first all-but-pin matching entry as a
+ * best-guess fuzzy result for broken mptables.
+ */
+ if (best_idx < 0) {
+ best_idx = i;
+ best_ioapic = ioapic_idx;
}
}
- return best_guess;
+ if (best_idx < 0)
+ return -1;
+
+out:
+ irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+ IOAPIC_MAP_ALLOC);
+ if (irq > 0)
+ set_io_apic_irq_attr(irq_attr, best_ioapic,
+ mp_irqs[best_idx].dstirq,
+ irq_trigger(best_idx),
+ irq_polarity(best_idx));
+ return irq;
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
@@ -1198,7 +1386,7 @@ void __setup_vector_irq(int cpu)
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (!cfg)
continue;
@@ -1227,12 +1415,10 @@ static inline int IO_APIC_irq_trigger(int irq)
{
int apic, idx, pin;
- for (apic = 0; apic < nr_ioapics; apic++) {
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
- return irq_trigger(idx);
- }
+ for_each_ioapic_pin(apic, pin) {
+ idx = find_irq_entry(apic, pin, mp_INT);
+ if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
+ return irq_trigger(idx);
}
/*
* nonexistent IRQs are edge default
@@ -1330,95 +1516,29 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
}
ioapic_register_intr(irq, cfg, attr->trigger);
- if (irq < legacy_pic->nr_legacy_irqs)
+ if (irq < nr_legacy_irqs())
legacy_pic->mask(irq);
ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
}
-static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin)
-{
- if (idx != -1)
- return false;
-
- apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
- mpc_ioapic_id(ioapic_idx), pin);
- return true;
-}
-
-static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
-{
- int idx, node = cpu_to_node(0);
- struct io_apic_irq_attr attr;
- unsigned int pin, irq;
-
- for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) {
- idx = find_irq_entry(ioapic_idx, pin, mp_INT);
- if (io_apic_pin_not_connected(idx, ioapic_idx, pin))
- continue;
-
- irq = pin_2_irq(idx, ioapic_idx, pin);
-
- if ((ioapic_idx > 0) && (irq > 16))
- continue;
-
- /*
- * Skip the timer IRQ if there's a quirk handler
- * installed and if it returns 1:
- */
- if (apic->multi_timer_check &&
- apic->multi_timer_check(ioapic_idx, irq))
- continue;
-
- set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
- irq_polarity(idx));
-
- io_apic_setup_irq_pin(irq, node, &attr);
- }
-}
-
static void __init setup_IO_APIC_irqs(void)
{
- unsigned int ioapic_idx;
+ unsigned int ioapic, pin;
+ int idx;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
- __io_apic_setup_irqs(ioapic_idx);
-}
-
-/*
- * for the gsit that is not in first ioapic
- * but could not use acpi_register_gsi()
- * like some special sci in IBM x3330
- */
-void setup_IO_APIC_irq_extra(u32 gsi)
-{
- int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0);
- struct io_apic_irq_attr attr;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic_idx = mp_find_ioapic(gsi);
- if (ioapic_idx < 0)
- return;
-
- pin = mp_find_ioapic_pin(ioapic_idx, gsi);
- idx = find_irq_entry(ioapic_idx, pin, mp_INT);
- if (idx == -1)
- return;
-
- irq = pin_2_irq(idx, ioapic_idx, pin);
-
- /* Only handle the non legacy irqs on secondary ioapics */
- if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY)
- return;
-
- set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx),
- irq_polarity(idx));
-
- io_apic_setup_irq_pin_once(irq, node, &attr);
+ for_each_ioapic_pin(ioapic, pin) {
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ apic_printk(APIC_VERBOSE,
+ KERN_DEBUG " apic %d pin %d not connected\n",
+ mpc_ioapic_id(ioapic), pin);
+ else
+ pin_2_irq(idx, ioapic, pin,
+ ioapic ? 0 : IOAPIC_MAP_ALLOC);
+ }
}
/*
@@ -1586,7 +1706,7 @@ __apicdebuginit(void) print_IO_APICs(void)
struct irq_chip *chip;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
mpc_ioapic_id(ioapic_idx),
ioapics[ioapic_idx].nr_registers);
@@ -1597,7 +1717,7 @@ __apicdebuginit(void) print_IO_APICs(void)
*/
printk(KERN_INFO "testing the IO APIC.......................\n");
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
+ for_each_ioapic(ioapic_idx)
print_IO_APIC(ioapic_idx);
printk(KERN_DEBUG "IRQ to pin mappings:\n");
@@ -1608,7 +1728,7 @@ __apicdebuginit(void) print_IO_APICs(void)
if (chip != &ioapic_chip)
continue;
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (!cfg)
continue;
entry = cfg->irq_2_pin;
@@ -1758,7 +1878,7 @@ __apicdebuginit(void) print_PIC(void)
unsigned int v;
unsigned long flags;
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1828,26 +1948,22 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
int i8259_apic, i8259_pin;
- int apic;
+ int apic, pin;
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
- for(apic = 0; apic < nr_ioapics; apic++) {
- int pin;
+ for_each_ioapic_pin(apic, pin) {
/* See if any of the pins is in ExtINT mode */
- for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
- struct IO_APIC_route_entry entry;
- entry = ioapic_read_entry(apic, pin);
+ struct IO_APIC_route_entry entry = ioapic_read_entry(apic, pin);
- /* If the interrupt line is enabled and in ExtInt mode
- * I have found the pin where the i8259 is connected.
- */
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
- ioapic_i8259.apic = apic;
- ioapic_i8259.pin = pin;
- goto found_i8259;
- }
+ /* If the interrupt line is enabled and in ExtINT mode,
+ * we have found the pin where the i8259 is connected.
+ */
+ if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ ioapic_i8259.apic = apic;
+ ioapic_i8259.pin = pin;
+ goto found_i8259;
}
}
found_i8259:
@@ -1919,7 +2035,7 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
- if (!legacy_pic->nr_legacy_irqs)
+ if (!nr_legacy_irqs())
return;
x86_io_apic_ops.disable();
@@ -1950,7 +2066,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
- for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
+ for_each_ioapic(ioapic_idx) {
/* Read the register 0 value */
raw_spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic_idx, 0);
@@ -2123,7 +2239,7 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- if (irq < legacy_pic->nr_legacy_irqs) {
+ if (irq < nr_legacy_irqs()) {
legacy_pic->mask(irq);
if (legacy_pic->irq_pending(irq))
was_pending = 1;
@@ -2225,7 +2341,7 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __this_cpu_write(vector_irq[vector], -1);
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
unlock:
raw_spin_unlock(&desc->lock);
}
@@ -2253,7 +2369,7 @@ static void irq_complete_move(struct irq_cfg *cfg)
void irq_force_complete_move(int irq)
{
- struct irq_cfg *cfg = irq_get_chip_data(irq);
+ struct irq_cfg *cfg = irq_cfg(irq);
if (!cfg)
return;
@@ -2507,6 +2623,7 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_eoi = ack_apic_level,
.irq_set_affinity = native_ioapic_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
static inline void init_IO_APIC_traps(void)
@@ -2514,26 +2631,15 @@ static inline void init_IO_APIC_traps(void)
struct irq_cfg *cfg;
unsigned int irq;
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
for_each_active_irq(irq) {
- cfg = irq_get_chip_data(irq);
+ cfg = irq_cfg(irq);
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
- if (irq < legacy_pic->nr_legacy_irqs)
+ if (irq < nr_legacy_irqs())
legacy_pic->make_irq(irq);
else
/* Strange. Oh, well.. */
@@ -2649,8 +2755,6 @@ static int __init disable_timer_pin_setup(char *arg)
}
early_param("disable_timer_pin_1", disable_timer_pin_setup);
-int timer_through_8259 __initdata;
-
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2661,7 +2765,7 @@ int timer_through_8259 __initdata;
*/
static inline void __init check_timer(void)
{
- struct irq_cfg *cfg = irq_get_chip_data(0);
+ struct irq_cfg *cfg = irq_cfg(0);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
unsigned long flags;
@@ -2755,7 +2859,6 @@ static inline void __init check_timer(void)
legacy_pic->unmask(0);
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
- timer_through_8259 = 1;
goto out;
}
/*
@@ -2827,15 +2930,54 @@ out:
*/
#define PIC_IRQS (1UL << PIC_CASCADE_IR)
+static int mp_irqdomain_create(int ioapic)
+{
+ size_t size;
+ int hwirqs = mp_ioapic_pin_count(ioapic);
+ struct ioapic *ip = &ioapics[ioapic];
+ struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
+ struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+
+ size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
+ ip->pin_info = kzalloc(size, GFP_KERNEL);
+ if (!ip->pin_info)
+ return -ENOMEM;
+
+ if (cfg->type == IOAPIC_DOMAIN_INVALID)
+ return 0;
+
+ ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
+ (void *)(long)ioapic);
+ if(!ip->irqdomain) {
+ kfree(ip->pin_info);
+ ip->pin_info = NULL;
+ return -ENOMEM;
+ }
+
+ if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
+ cfg->type == IOAPIC_DOMAIN_STRICT)
+ ioapic_dynirq_base = max(ioapic_dynirq_base,
+ gsi_cfg->gsi_end + 1);
+
+ if (gsi_cfg->gsi_base == 0)
+ irq_set_default_host(ip->irqdomain);
+
+ return 0;
+}
+
void __init setup_IO_APIC(void)
{
+ int ioapic;
/*
* calling enable_IO_APIC() is moved to setup_local_APIC for BP
*/
- io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
+ io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
+ for_each_ioapic(ioapic)
+ BUG_ON(mp_irqdomain_create(ioapic));
+
/*
* Set up IO-APIC IRQ routing.
*/
@@ -2844,8 +2986,10 @@ void __init setup_IO_APIC(void)
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
- if (legacy_pic->nr_legacy_irqs)
+ if (nr_legacy_irqs())
check_timer();
+
+ ioapic_initialized = 1;
}
/*
@@ -2880,7 +3024,7 @@ static void ioapic_resume(void)
{
int ioapic_idx;
- for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--)
+ for_each_ioapic_reverse(ioapic_idx)
resume_ioapic_id(ioapic_idx);
restore_ioapic_entries();
@@ -2926,7 +3070,7 @@ int arch_setup_hwirq(unsigned int irq, int node)
void arch_teardown_hwirq(unsigned int irq)
{
- struct irq_cfg *cfg = irq_get_chip_data(irq);
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long flags;
free_remapped_irq(irq);
@@ -3030,6 +3174,7 @@ static struct irq_chip msi_chip = {
.irq_ack = ack_apic_edge,
.irq_set_affinity = msi_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
@@ -3053,7 +3198,7 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
if (!irq_offset)
write_msi_msg(irq, &msg);
- setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
@@ -3128,6 +3273,7 @@ static struct irq_chip dmar_msi_type = {
.irq_ack = ack_apic_edge,
.irq_set_affinity = dmar_msi_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
int arch_setup_dmar_msi(unsigned int irq)
@@ -3178,6 +3324,7 @@ static struct irq_chip hpet_msi_type = {
.irq_ack = ack_apic_edge,
.irq_set_affinity = hpet_msi_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
int default_setup_hpet_msi(unsigned int irq, unsigned int id)
@@ -3192,7 +3339,7 @@ int default_setup_hpet_msi(unsigned int irq, unsigned int id)
hpet_msi_write(irq_get_handler_data(irq), &msg);
irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- setup_remapped_irq(irq, irq_get_chip_data(irq), chip);
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
return 0;
@@ -3241,6 +3388,7 @@ static struct irq_chip ht_irq_chip = {
.irq_ack = ack_apic_edge,
.irq_set_affinity = ht_set_affinity,
.irq_retrigger = ioapic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3303,27 +3451,6 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
return ret;
}
-int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr)
-{
- unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin;
- int ret;
- struct IO_APIC_route_entry orig_entry;
-
- /* Avoid redundant programming */
- if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) {
- pr_debug("Pin %d-%d already programmed\n", mpc_ioapic_id(ioapic_idx), pin);
- orig_entry = ioapic_read_entry(attr->ioapic, pin);
- if (attr->trigger == orig_entry.trigger && attr->polarity == orig_entry.polarity)
- return 0;
- return -EBUSY;
- }
- ret = io_apic_setup_irq_pin(irq, node, attr);
- if (!ret)
- set_bit(pin, ioapics[ioapic_idx].pin_programmed);
- return ret;
-}
-
static int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
@@ -3340,20 +3467,13 @@ static int __init io_apic_get_redir_entries(int ioapic)
return reg_01.bits.entries + 1;
}
-static void __init probe_nr_irqs_gsi(void)
-{
- int nr;
-
- nr = gsi_top + NR_IRQS_LEGACY;
- if (nr > nr_irqs_gsi)
- nr_irqs_gsi = nr;
-
- printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
-}
-
unsigned int arch_dynirq_lower_bound(unsigned int from)
{
- return from < nr_irqs_gsi ? nr_irqs_gsi : from;
+ /*
+ * dmar_alloc_hwirq() may be called before setup_IO_APIC(), so use
+ * gsi_top if ioapic_dynirq_base hasn't been initialized yet.
+ */
+ return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
}
int __init arch_probe_nr_irqs(void)
@@ -3363,33 +3483,17 @@ int __init arch_probe_nr_irqs(void)
if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
nr_irqs = NR_VECTORS * nr_cpu_ids;
- nr = nr_irqs_gsi + 8 * nr_cpu_ids;
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
/*
* for MSI and HT dyn irq
*/
- nr += nr_irqs_gsi * 16;
+ nr += gsi_top * 16;
#endif
if (nr < nr_irqs)
nr_irqs = nr;
- return NR_IRQS_LEGACY;
-}
-
-int io_apic_set_pci_routing(struct device *dev, int irq,
- struct io_apic_irq_attr *irq_attr)
-{
- int node;
-
- if (!IO_APIC_IRQ(irq)) {
- apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
- irq_attr->ioapic);
- return -EINVAL;
- }
-
- node = dev ? dev_to_node(dev) : cpu_to_node(0);
-
- return io_apic_setup_irq_pin_once(irq, node, irq_attr);
+ return 0;
}
#ifdef CONFIG_X86_32
@@ -3483,9 +3587,8 @@ static u8 __init io_apic_unique_id(u8 id)
DECLARE_BITMAP(used, 256);
bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i)
__set_bit(mpc_ioapic_id(i), used);
- }
if (!test_bit(id, used))
return id;
return find_first_zero_bit(used, 256);
@@ -3543,14 +3646,13 @@ void __init setup_ioapic_dest(void)
if (skip_ioapic_setup == 1)
return;
- for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
- for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
+ for_each_ioapic_pin(ioapic, pin) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
- irq = pin_2_irq(irq_entry, ioapic, pin);
- if ((ioapic > 0) && (irq > 16))
+ irq = pin_2_irq(irq_entry, ioapic, pin, 0);
+ if (irq < 0 || !mp_init_irq_at_boot(ioapic, irq))
continue;
idata = irq_get_irq_data(irq);
@@ -3573,29 +3675,33 @@ void __init setup_ioapic_dest(void)
static struct resource *ioapic_resources;
-static struct resource * __init ioapic_setup_resources(int nr_ioapics)
+static struct resource * __init ioapic_setup_resources(void)
{
unsigned long n;
struct resource *res;
char *mem;
- int i;
+ int i, num = 0;
- if (nr_ioapics <= 0)
+ for_each_ioapic(i)
+ num++;
+ if (num == 0)
return NULL;
n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource);
- n *= nr_ioapics;
+ n *= num;
mem = alloc_bootmem(n);
res = (void *)mem;
- mem += sizeof(struct resource) * nr_ioapics;
+ mem += sizeof(struct resource) * num;
- for (i = 0; i < nr_ioapics; i++) {
- res[i].name = mem;
- res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ num = 0;
+ for_each_ioapic(i) {
+ res[num].name = mem;
+ res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
+ num++;
}
ioapic_resources = res;
@@ -3609,8 +3715,8 @@ void __init native_io_apic_init_mappings(void)
struct resource *ioapic_res;
int i;
- ioapic_res = ioapic_setup_resources(nr_ioapics);
- for (i = 0; i < nr_ioapics; i++) {
+ ioapic_res = ioapic_setup_resources();
+ for_each_ioapic(i) {
if (smp_found_config) {
ioapic_phys = mpc_ioapic_addr(i);
#ifdef CONFIG_X86_32
@@ -3641,8 +3747,6 @@ fake_ioapic_page:
ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
-
- probe_nr_irqs_gsi();
}
void __init ioapic_insert_resources(void)
@@ -3657,7 +3761,7 @@ void __init ioapic_insert_resources(void)
return;
}
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
insert_resource(&iomem_resource, r);
r++;
}
@@ -3665,16 +3769,15 @@ void __init ioapic_insert_resources(void)
int mp_find_ioapic(u32 gsi)
{
- int i = 0;
+ int i;
if (nr_ioapics == 0)
return -1;
/* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
+ for_each_ioapic(i) {
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
- if ((gsi >= gsi_cfg->gsi_base)
- && (gsi <= gsi_cfg->gsi_end))
+ if (gsi >= gsi_cfg->gsi_base && gsi <= gsi_cfg->gsi_end)
return i;
}
@@ -3686,7 +3789,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
struct mp_ioapic_gsi *gsi_cfg;
- if (WARN_ON(ioapic == -1))
+ if (WARN_ON(ioapic < 0))
return -1;
gsi_cfg = mp_ioapic_gsi_routing(ioapic);
@@ -3729,7 +3832,8 @@ static __init int bad_ioapic_register(int idx)
return 0;
}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg)
{
int idx = 0;
int entries;
@@ -3743,6 +3847,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
ioapics[idx].mp_config.type = MP_IOAPIC;
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
@@ -3779,6 +3885,97 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
nr_ioapics++;
}
+int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
+ irq_hw_number_t hwirq)
+{
+ int ioapic = (int)(long)domain->host_data;
+ struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
+ struct io_apic_irq_attr attr;
+
+ /* Get default attribute if not set by caller yet */
+ if (!info->set) {
+ u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+
+ if (acpi_get_override_irq(gsi, &info->trigger,
+ &info->polarity) < 0) {
+ /*
+ * PCI interrupts are always active low (polarity 1)
+ * and level triggered (trigger 1).
+ */
+ info->trigger = 1;
+ info->polarity = 1;
+ }
+ info->node = NUMA_NO_NODE;
+
+ /*
+ * setup_IO_APIC_irqs() programs all legacy IRQs with default
+ * trigger and polarity attributes. Don't set the flag for that
+ * case so the first legacy IRQ user could reprogram the pin
+ * with real trigger and polarity attributes.
+ */
+ if (virq >= nr_legacy_irqs() || info->count)
+ info->set = 1;
+ }
+ set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
+ info->polarity);
+
+ return io_apic_setup_irq_pin(virq, info->node, &attr);
+}
+
+void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ struct irq_cfg *cfg = irq_cfg(virq);
+ int ioapic = (int)(long)domain->host_data;
+ int pin = (int)data->hwirq;
+
+ ioapic_mask_entry(ioapic, pin);
+ __remove_pin_from_irq(cfg, ioapic, pin);
+ WARN_ON(cfg->irq_2_pin != NULL);
+ arch_teardown_hwirq(virq);
+}
+
+int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+{
+ int ret = 0;
+ int ioapic, pin;
+ struct mp_pin_info *info;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -ENODEV;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ info = mp_pin_info(ioapic, pin);
+ trigger = trigger ? 1 : 0;
+ polarity = polarity ? 1 : 0;
+
+ mutex_lock(&ioapic_mutex);
+ if (!info->set) {
+ info->trigger = trigger;
+ info->polarity = polarity;
+ info->node = node;
+ info->set = 1;
+ } else if (info->trigger != trigger || info->polarity != polarity) {
+ ret = -EBUSY;
+ }
+ mutex_unlock(&ioapic_mutex);
+
+ return ret;
+}
+
+bool mp_should_keep_irq(struct device *dev)
+{
+ if (dev->power.is_prepared)
+ return true;
+#ifdef CONFIG_PM_RUNTIME
+ if (dev->power.runtime_status == RPM_SUSPENDING)
+ return true;
+#endif
+
+ return false;
+}
+
/* Enable IOAPIC early just for system timer */
void __init pre_init_apic_IRQ0(void)
{
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index cceb352c968c..bda488680dbc 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -88,21 +88,16 @@ static struct apic apic_default = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = default_check_apicid_used,
- .check_apicid_present = default_check_apicid_present,
.vector_allocation_domain = flat_vector_allocation_domain,
.init_apic_ldr = default_init_apic_ldr,
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = setup_apic_flat_routing,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = default_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = default_get_apic_id,
.set_apic_id = NULL,
@@ -116,11 +111,7 @@ static struct apic apic_default = {
.send_IPI_all = default_send_IPI_all,
.send_IPI_self = default_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
-
.wait_for_init_deassert = true,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = default_inquire_remote_apic,
.read = native_apic_mem_read,
@@ -214,29 +205,7 @@ void __init generic_apic_probe(void)
printk(KERN_INFO "Using APIC driver %s\n", apic->name);
}
-/* These functions can switch the APIC even after the initial ->probe() */
-
-int __init
-generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
-{
- struct apic **drv;
-
- for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
- if (!((*drv)->mps_oem_check))
- continue;
- if (!(*drv)->mps_oem_check(mpc, oem, productid))
- continue;
-
- if (!cmdline_apic) {
- apic = *drv;
- printk(KERN_INFO "Switched to APIC driver `%s'.\n",
- apic->name);
- }
- return 1;
- }
- return 0;
-}
-
+/* This function can switch the APIC even after the initial ->probe() */
int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
struct apic **drv;
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e66766bf1641..e658f21681c8 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -42,7 +42,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
* We are to modify mask, so we need an own copy
* and be sure it's manipulated with irq off.
*/
- ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+ ipi_mask_ptr = this_cpu_cpumask_var_ptr(ipi_mask);
cpumask_copy(ipi_mask_ptr, mask);
/*
@@ -249,21 +249,16 @@ static struct apic apic_x2apic_cluster = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = cluster_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -277,10 +272,7 @@ static struct apic apic_x2apic_cluster = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6d600ebf6c12..6fae733e9194 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -103,21 +103,16 @@ static struct apic apic_x2apic_phys = {
.disable_esr = 0,
.dest_logical = 0,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = x2apic_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
@@ -131,10 +126,7 @@ static struct apic apic_x2apic_phys = {
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 293b41df54ef..8e9dcfd630e4 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -204,7 +204,6 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
{
-#ifdef CONFIG_SMP
unsigned long val;
int pnode;
@@ -223,7 +222,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
atomic_set(&init_deasserted, 1);
-#endif
return 0;
}
@@ -365,21 +363,16 @@ static struct apic __refdata apic_x2apic_uv_x = {
.disable_esr = 0,
.dest_logical = APIC_DEST_LOGICAL,
.check_apicid_used = NULL,
- .check_apicid_present = NULL,
.vector_allocation_domain = default_vector_allocation_domain,
.init_apic_ldr = uv_init_apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
- .multi_timer_check = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
- .setup_portio_remap = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
- .enable_apic_mode = NULL,
.phys_pkg_id = uv_phys_pkg_id,
- .mps_oem_check = NULL,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = set_apic_id,
@@ -394,10 +387,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
.send_IPI_self = uv_send_IPI_self,
.wakeup_secondary_cpu = uv_wakeup_secondary,
- .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
- .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
.wait_for_init_deassert = false,
- .smp_callin_clear_local_apic = NULL,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 7fd54f09b011..e27b49d7c922 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,10 +13,13 @@ nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
obj-y := intel_cacheinfo.o scattered.o topology.o
-obj-y += proc.o capflags.o powerflags.o common.o
+obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+
obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
@@ -36,7 +39,12 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
+
+obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
+ perf_event_intel_uncore_snb.o \
+ perf_event_intel_uncore_snbep.o \
+ perf_event_intel_uncore_nhmex.o
endif
@@ -48,6 +56,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
obj-$(CONFIG_HYPERVISOR_GUEST) += vmware.o hypervisor.o mshyperv.o
+ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
@@ -56,3 +65,4 @@ cpufeature = $(src)/../../include/asm/cpufeature.h
targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
+endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 60e5497681f5..813d29d00a17 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -525,6 +525,13 @@ static void early_init_amd(struct cpuinfo_x86 *c)
}
#endif
+ /*
+ * This is only needed to tell the kernel whether to use VMCALL
+ * or VMMCALL. VMMCALL is never executed except under virt, so
+ * we can set it unconditionally.
+ */
+ set_cpu_cap(c, X86_FEATURE_VMMCALL);
+
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 333fd5209336..4b4f78c9ba19 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -148,6 +148,7 @@ static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
return 1;
@@ -161,6 +162,13 @@ static int __init x86_xsaveopt_setup(char *s)
}
__setup("noxsaveopt", x86_xsaveopt_setup);
+static int __init x86_xsaves_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+ return 1;
+}
+__setup("noxsaves", x86_xsaves_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override = -1;
static int disable_x86_serial_nr = 1;
@@ -338,8 +346,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
continue;
printk(KERN_WARNING
- "CPU: CPU feature %s disabled, no CPUID level 0x%x\n",
- x86_cap_flags[df->feature], df->level);
+ "CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n",
+ x86_cap_flag(df->feature), df->level);
}
}
@@ -956,6 +964,7 @@ static void vgetcpu_set_mode(void)
vgetcpu_mode = VGETCPU_LSL;
}
+#ifdef CONFIG_IA32_EMULATION
/* May not be __init: called during resume */
static void syscall32_cpu_init(void)
{
@@ -967,7 +976,8 @@ static void syscall32_cpu_init(void)
wrmsrl(MSR_CSTAR, ia32_cstar_target);
}
-#endif
+#endif /* CONFIG_IA32_EMULATION */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
@@ -1176,7 +1186,7 @@ void syscall_init(void)
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
- X86_EFLAGS_IOPL|X86_EFLAGS_AC);
+ X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
/*
@@ -1190,9 +1200,9 @@ DEFINE_PER_CPU(int, debug_stack_usage);
int is_debug_stack(unsigned long addr)
{
- return __get_cpu_var(debug_stack_usage) ||
- (addr <= __get_cpu_var(debug_stack_addr) &&
- addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+ return __this_cpu_read(debug_stack_usage) ||
+ (addr <= __this_cpu_read(debug_stack_addr) &&
+ addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
}
NOKPROBE_SYMBOL(is_debug_stack);
@@ -1258,6 +1268,19 @@ static void dbg_restore_debug_regs(void)
#define dbg_restore_debug_regs()
#endif /* ! CONFIG_KGDB */
+static void wait_for_master_cpu(int cpu)
+{
+#ifdef CONFIG_SMP
+ /*
+ * wait for ACK from master CPU before continuing
+ * with AP initialization
+ */
+ WARN_ON(cpumask_test_and_set_cpu(cpu, cpu_initialized_mask));
+ while (!cpumask_test_cpu(cpu, cpu_callout_mask))
+ cpu_relax();
+#endif
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -1273,16 +1296,17 @@ void cpu_init(void)
struct task_struct *me;
struct tss_struct *t;
unsigned long v;
- int cpu;
+ int cpu = stack_smp_processor_id();
int i;
+ wait_for_master_cpu(cpu);
+
/*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
load_ucode_ap();
- cpu = stack_smp_processor_id();
t = &per_cpu(init_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
@@ -1294,9 +1318,6 @@ void cpu_init(void)
me = current;
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
- panic("CPU#%d already initialized!\n", cpu);
-
pr_debug("Initializing CPU#%d\n", cpu);
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
@@ -1373,17 +1394,13 @@ void cpu_init(void)
struct tss_struct *t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
- show_ucode_info_early();
+ wait_for_master_cpu(cpu);
- if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
- printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
- for (;;)
- local_irq_enable();
- }
+ show_ucode_info_early();
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+ if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 74e804ddc5c7..9cc6b6f25f42 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -144,6 +144,21 @@ static void early_init_intel(struct cpuinfo_x86 *c)
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
+
+ /*
+ * Intel Quark Core DevMan_001.pdf section 6.4.11
+ * "The operating system also is required to invalidate (i.e., flush)
+ * the TLB when any changes are made to any of the page table entries.
+ * The operating system must reload CR3 to cause the TLB to be flushed"
+ *
+ * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
+ * be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
+ * to be modified
+ */
+ if (c->x86 == 5 && c->x86_model == 9) {
+ pr_info("Disabling PGE capability bit\n");
+ setup_clear_cpu_cap(X86_FEATURE_PGE);
+ }
}
#ifdef CONFIG_X86_32
@@ -198,12 +213,13 @@ static void intel_workarounds(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_F00F_BUG
/*
- * All current models of Pentium and Pentium with MMX technology CPUs
+ * All models of Pentium and Pentium with MMX technology CPUs
* have the F0 0F bug, which lets nonprivileged users lock up the
* system. Announce that the fault handler will be checking for it.
+ * The Quark is also family 5, but does not have the same bug.
*/
clear_cpu_bug(c, X86_BUG_F00F);
- if (!paravirt_enabled() && c->x86 == 5) {
+ if (!paravirt_enabled() && c->x86 == 5 && c->x86_model < 9) {
static int f00f_workaround_enabled;
set_cpu_bug(c, X86_BUG_F00F);
@@ -382,6 +398,13 @@ static void init_intel(struct cpuinfo_x86 *c)
}
l2 = init_intel_cacheinfo(c);
+
+ /* Detect legacy cache sizes if init_intel_cacheinfo did not */
+ if (l2 == 0) {
+ cpu_detect_cache_sizes(c);
+ l2 = c->x86_cache_size;
+ }
+
if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
@@ -485,6 +508,13 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
*/
if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0))
size = 256;
+
+ /*
+ * Intel Quark SoC X1000 contains a 4-way set associative
+ * 16K cache with a 16 byte cache line and 256 lines per tag
+ */
+ if ((c->x86 == 5) && (c->x86_model == 9))
+ size = 16;
return size;
}
#endif
@@ -686,7 +716,8 @@ static const struct cpu_dev intel_cpu_dev = {
[3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
[7] = "Mobile Pentium 75 - 200",
- [8] = "Mobile Pentium MMX"
+ [8] = "Mobile Pentium MMX",
+ [9] = "Quark SoC X1000",
}
},
{ .family = 6, .model_names =
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 9c8f7394c612..c7035073dfc1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -461,7 +461,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- if (strict_strtoul(buf, 10, &val) < 0)
+ if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
@@ -511,7 +511,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return -EINVAL;
- if (strict_strtoul(buf, 16, &val) < 0)
+ if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
if (amd_set_subcaches(cpu, val))
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 5ac2d1fb28bc..4cfba4371a71 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -83,7 +83,7 @@ static DEFINE_MUTEX(mce_inject_mutex);
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
{
int cpu = smp_processor_id();
- struct mce *m = &__get_cpu_var(injectm);
+ struct mce *m = this_cpu_ptr(&injectm);
if (!cpumask_test_cpu(cpu, mce_inject_cpumask))
return NMI_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
@@ -97,7 +97,7 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
static void mce_irq_ipi(void *info)
{
int cpu = smp_processor_id();
- struct mce *m = &__get_cpu_var(injectm);
+ struct mce *m = this_cpu_ptr(&injectm);
if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
m->inject_flags & MCJ_EXCEPTION) {
@@ -109,7 +109,7 @@ static void mce_irq_ipi(void *info)
/* Inject mce on current CPU */
static int raise_local(void)
{
- struct mce *m = &__get_cpu_var(injectm);
+ struct mce *m = this_cpu_ptr(&injectm);
int context = MCJ_CTX(m->inject_flags);
int ret = 0;
int cpu = m->extcpu;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 4fc57975acc1..61a9668cebfd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -400,7 +400,7 @@ static u64 mce_rdmsrl(u32 msr)
if (offset < 0)
return 0;
- return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
+ return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
}
if (rdmsrl_safe(msr, &v)) {
@@ -422,7 +422,7 @@ static void mce_wrmsrl(u32 msr, u64 v)
int offset = msr_to_offset(msr);
if (offset >= 0)
- *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
+ *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
return;
}
wrmsrl(msr, v);
@@ -478,7 +478,7 @@ static DEFINE_PER_CPU(struct mce_ring, mce_ring);
/* Runs with CPU affinity in workqueue */
static int mce_ring_empty(void)
{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
+ struct mce_ring *r = this_cpu_ptr(&mce_ring);
return r->start == r->end;
}
@@ -490,7 +490,7 @@ static int mce_ring_get(unsigned long *pfn)
*pfn = 0;
get_cpu();
- r = &__get_cpu_var(mce_ring);
+ r = this_cpu_ptr(&mce_ring);
if (r->start == r->end)
goto out;
*pfn = r->ring[r->start];
@@ -504,7 +504,7 @@ out:
/* Always runs in MCE context with preempt off */
static int mce_ring_add(unsigned long pfn)
{
- struct mce_ring *r = &__get_cpu_var(mce_ring);
+ struct mce_ring *r = this_cpu_ptr(&mce_ring);
unsigned next;
next = (r->end + 1) % MCE_RING_SIZE;
@@ -526,7 +526,7 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
if (!mce_ring_empty())
- schedule_work(&__get_cpu_var(mce_work));
+ schedule_work(this_cpu_ptr(&mce_work));
}
DEFINE_PER_CPU(struct irq_work, mce_irq_work);
@@ -551,7 +551,7 @@ static void mce_report_event(struct pt_regs *regs)
return;
}
- irq_work_queue(&__get_cpu_var(mce_irq_work));
+ irq_work_queue(this_cpu_ptr(&mce_irq_work));
}
/*
@@ -1045,7 +1045,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_gather_info(&m, regs);
- final = &__get_cpu_var(mces_seen);
+ final = this_cpu_ptr(&mces_seen);
*final = m;
memset(valid_banks, 0, sizeof(valid_banks));
@@ -1278,22 +1278,22 @@ static unsigned long (*mce_adjust_timer)(unsigned long interval) =
static int cmc_error_seen(void)
{
- unsigned long *v = &__get_cpu_var(mce_polled_error);
+ unsigned long *v = this_cpu_ptr(&mce_polled_error);
return test_and_clear_bit(0, v);
}
static void mce_timer_fn(unsigned long data)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned long iv;
int notify;
WARN_ON(smp_processor_id() != data);
- if (mce_available(__this_cpu_ptr(&cpu_info))) {
+ if (mce_available(this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
+ this_cpu_ptr(&mce_poll_banks));
mce_intel_cmci_poll();
}
@@ -1323,7 +1323,7 @@ static void mce_timer_fn(unsigned long data)
*/
void mce_timer_kick(unsigned long interval)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
@@ -1659,7 +1659,7 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
static void __mcheck_cpu_init_timer(void)
{
- struct timer_list *t = &__get_cpu_var(mce_timer);
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
unsigned int cpu = smp_processor_id();
setup_timer(t, mce_timer_fn, cpu);
@@ -1702,8 +1702,8 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
- INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
- init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
+ INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
+ init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
}
/*
@@ -1955,7 +1955,7 @@ static struct miscdevice mce_chrdev_device = {
static void __mce_disable_bank(void *arg)
{
int bank = *((int *)arg);
- __clear_bit(bank, __get_cpu_var(mce_poll_banks));
+ __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
cmci_disable_bank(bank);
}
@@ -2065,7 +2065,7 @@ static void mce_syscore_shutdown(void)
static void mce_syscore_resume(void)
{
__mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
+ __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
}
static struct syscore_ops mce_syscore_ops = {
@@ -2080,7 +2080,7 @@ static struct syscore_ops mce_syscore_ops = {
static void mce_cpu_restart(void *data)
{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
__mcheck_cpu_init_generic();
__mcheck_cpu_init_timer();
@@ -2096,14 +2096,14 @@ static void mce_restart(void)
/* Toggle features for corrected errors */
static void mce_disable_cmci(void *data)
{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
cmci_clear();
}
static void mce_enable_ce(void *all)
{
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
cmci_reenable();
cmci_recheck();
@@ -2136,7 +2136,7 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
attr_to_bank(attr)->ctl = new;
@@ -2174,7 +2174,7 @@ static ssize_t set_ignore_ce(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.ignore_ce ^ !!new) {
@@ -2198,7 +2198,7 @@ static ssize_t set_cmci_disabled(struct device *s,
{
u64 new;
- if (strict_strtoull(buf, 0, &new) < 0)
+ if (kstrtou64(buf, 0, &new) < 0)
return -EINVAL;
if (mca_cfg.cmci_disabled ^ !!new) {
@@ -2336,7 +2336,7 @@ static void mce_disable_cpu(void *h)
unsigned long action = *(unsigned long *)h;
int i;
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
if (!(action & CPU_TASKS_FROZEN))
@@ -2354,7 +2354,7 @@ static void mce_reenable_cpu(void *h)
unsigned long action = *(unsigned long *)h;
int i;
- if (!mce_available(__this_cpu_ptr(&cpu_info)))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
if (!(action & CPU_TASKS_FROZEN))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 603df4f74640..5d4999f95aec 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -310,7 +310,7 @@ static void amd_threshold_interrupt(void)
* event.
*/
machine_check_poll(MCP_TIMESTAMP,
- &__get_cpu_var(mce_poll_banks));
+ this_cpu_ptr(&mce_poll_banks));
if (high & MASK_OVERFLOW_HI) {
rdmsrl(address, m.misc);
@@ -353,7 +353,7 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
if (!b->interrupt_capable)
return -EINVAL;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
b->interrupt_enable = !!new;
@@ -372,7 +372,7 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
struct thresh_restart tr;
unsigned long new;
- if (strict_strtoul(buf, 0, &new) < 0)
+ if (kstrtoul(buf, 0, &new) < 0)
return -EINVAL;
if (new > THRESHOLD_MAX)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 3bdb95ae8c43..b3c97bafc123 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -86,7 +86,7 @@ void mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
return;
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
}
void mce_intel_hcpu_update(unsigned long cpu)
@@ -145,7 +145,7 @@ static void cmci_storm_disable_banks(void)
u64 val;
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
- owned = __get_cpu_var(mce_banks_owned);
+ owned = this_cpu_ptr(mce_banks_owned);
for_each_set_bit(bank, owned, MAX_NR_BANKS) {
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
@@ -195,7 +195,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
@@ -206,7 +206,7 @@ static void intel_threshold_interrupt(void)
*/
static void cmci_discover(int banks)
{
- unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+ unsigned long *owned = (void *)this_cpu_ptr(&mce_banks_owned);
unsigned long flags;
int i;
int bios_wrong_thresh = 0;
@@ -228,7 +228,7 @@ static void cmci_discover(int banks)
/* Already owned by someone else? */
if (val & MCI_CTL2_CMCI_EN) {
clear_bit(i, owned);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ __clear_bit(i, this_cpu_ptr(mce_poll_banks));
continue;
}
@@ -252,7 +252,7 @@ static void cmci_discover(int banks)
/* Did the enable bit stick? -- the bank supports CMCI */
if (val & MCI_CTL2_CMCI_EN) {
set_bit(i, owned);
- __clear_bit(i, __get_cpu_var(mce_poll_banks));
+ __clear_bit(i, this_cpu_ptr(mce_poll_banks));
/*
* We are able to set thresholds for some banks that
* had a threshold of 0. This means the BIOS has not
@@ -263,7 +263,7 @@ static void cmci_discover(int banks)
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
bios_wrong_thresh = 1;
} else {
- WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+ WARN_ON(!test_bit(i, this_cpu_ptr(mce_poll_banks)));
}
}
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
@@ -284,10 +284,10 @@ void cmci_recheck(void)
unsigned long flags;
int banks;
- if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
+ if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
@@ -296,12 +296,12 @@ static void __cmci_disable_bank(int bank)
{
u64 val;
- if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+ if (!test_bit(bank, this_cpu_ptr(mce_banks_owned)))
return;
rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
val &= ~MCI_CTL2_CMCI_EN;
wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
- __clear_bit(bank, __get_cpu_var(mce_banks_owned));
+ __clear_bit(bank, this_cpu_ptr(mce_banks_owned));
}
/*
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 36a1bb6d1ee0..1af51b1586d7 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -498,8 +498,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
- printk(KERN_DEBUG
- "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+ if (system_state == SYSTEM_BOOTING)
+ printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 617a9e284245..06674473b0e6 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -27,7 +27,7 @@ static u32 ucode_new_rev;
u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
-struct cpio_data ucode_cpio;
+static struct cpio_data ucode_cpio;
/*
* Microcode patch container file is prepended to the initrd in cpio format.
@@ -108,12 +108,13 @@ static size_t compute_container_size(u8 *data, u32 total_size)
* load_microcode_amd() to save equivalent cpu table and microcode patches in
* kernel heap memory.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size)
+static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
{
struct equiv_cpu_entry *eq;
size_t *cont_sz;
u32 *header;
u8 *data, **cont;
+ u8 (*patch)[PATCH_MAX_SIZE];
u16 eq_id = 0;
int offset, left;
u32 rev, eax, ebx, ecx, edx;
@@ -123,10 +124,12 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
cont_sz = (size_t *)__pa_nodebug(&container_size);
cont = (u8 **)__pa_nodebug(&container);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
#else
new_rev = &ucode_new_rev;
cont_sz = &container_size;
cont = &container;
+ patch = &amd_ucode_patch;
#endif
data = ucode;
@@ -213,9 +216,9 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
rev = mc->hdr.patch_id;
*new_rev = rev;
- /* save ucode patch */
- memcpy(amd_ucode_patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
+ if (save_patch)
+ memcpy(patch, mc,
+ min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
@@ -246,7 +249,7 @@ void __init load_ucode_amd_bsp(void)
*data = cp.data;
*size = cp.size;
- apply_ucode_in_initrd(cp.data, cp.size);
+ apply_ucode_in_initrd(cp.data, cp.size, true);
}
#ifdef CONFIG_X86_32
@@ -263,7 +266,7 @@ void load_ucode_amd_ap(void)
size_t *usize;
void **ucode;
- mc = (struct microcode_amd *)__pa(amd_ucode_patch);
+ mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
__apply_microcode_amd(mc);
return;
@@ -275,7 +278,7 @@ void load_ucode_amd_ap(void)
if (!*ucode || !*usize)
return;
- apply_ucode_in_initrd(*ucode, *usize);
+ apply_ucode_in_initrd(*ucode, *usize, false);
}
static void __init collect_cpu_sig_on_bsp(void *arg)
@@ -339,7 +342,7 @@ void load_ucode_amd_ap(void)
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
+ apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
}
}
#endif
@@ -347,7 +350,9 @@ void load_ucode_amd_ap(void)
int __init save_microcode_in_initrd_amd(void)
{
unsigned long cont;
+ int retval = 0;
enum ucode_state ret;
+ u8 *cont_va;
u32 eax;
if (!container)
@@ -355,13 +360,15 @@ int __init save_microcode_in_initrd_amd(void)
#ifdef CONFIG_X86_32
get_bsp_sig();
- cont = (unsigned long)container;
+ cont = (unsigned long)container;
+ cont_va = __va(container);
#else
/*
* We need the physical address of the container for both bitness since
* boot_params.hdr.ramdisk_image is a physical address.
*/
- cont = __pa(container);
+ cont = __pa(container);
+ cont_va = container;
#endif
/*
@@ -372,6 +379,8 @@ int __init save_microcode_in_initrd_amd(void)
if (relocated_ramdisk)
container = (u8 *)(__va(relocated_ramdisk) +
(cont - boot_params.hdr.ramdisk_image));
+ else
+ container = cont_va;
if (ucode_new_rev)
pr_info("microcode: updated early to new patch_level=0x%08x\n",
@@ -382,7 +391,7 @@ int __init save_microcode_in_initrd_amd(void)
ret = load_microcode_amd(eax, container, container_size);
if (ret != UCODE_OK)
- return -EINVAL;
+ retval = -EINVAL;
/*
* This will be freed any msec now, stash patches for the current
@@ -391,5 +400,5 @@ int __init save_microcode_in_initrd_amd(void)
container = NULL;
container_size = 0;
- return 0;
+ return retval;
}
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index 5f28a64e71ea..2c017f242a78 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -124,7 +124,7 @@ void __init load_ucode_bsp(void)
static bool check_loader_disabled_ap(void)
{
#ifdef CONFIG_X86_32
- return __pa_nodebug(dis_ucode_ldr);
+ return *((bool *)__pa_nodebug(&dis_ucode_ldr));
#else
return dis_ucode_ldr;
#endif
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a276fa75d9b5..c6826d1e8082 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -127,7 +127,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
return get_matching_microcode(csig, cpf, mc_intel, crev);
}
-int apply_microcode(int cpu)
+static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc_intel;
struct ucode_cpu_info *uci;
@@ -314,7 +314,7 @@ static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
- .apply_microcode = apply_microcode,
+ .apply_microcode = apply_microcode_intel,
.microcode_fini_cpu = microcode_fini_cpu,
};
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 18f739129e72..b88343f7a3b3 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -28,8 +28,8 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
-unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
-struct mc_saved_data {
+static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
+static struct mc_saved_data {
unsigned int mc_saved_count;
struct microcode_intel **mc_saved;
} mc_saved_data;
@@ -415,7 +415,7 @@ static void __ref show_saved_mc(void)
struct ucode_cpu_info uci;
if (mc_saved_data.mc_saved_count == 0) {
- pr_debug("no micorcode data saved.\n");
+ pr_debug("no microcode data saved.\n");
return;
}
pr_debug("Total microcode saved: %d\n", mc_saved_data.mc_saved_count);
@@ -506,7 +506,7 @@ int save_mc_for_early(u8 *mc)
if (mc_saved && mc_saved_count)
memcpy(mc_saved_tmp, mc_saved,
- mc_saved_count * sizeof(struct mirocode_intel *));
+ mc_saved_count * sizeof(struct microcode_intel *));
/*
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
@@ -526,7 +526,7 @@ int save_mc_for_early(u8 *mc)
show_saved_mc();
/*
- * Free old saved microcod data.
+ * Free old saved microcode data.
*/
if (mc_saved) {
for (i = 0; i < mc_saved_count_init; i++)
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index f961de9964c7..ea5f363a1948 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -707,7 +707,7 @@ void __init mtrr_bp_init(void)
} else {
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
+ if (cpu_feature_enabled(X86_FEATURE_K6_MTRR)) {
/* Pre-Athlon (K6) AMD CPU MTRRs */
mtrr_if = mtrr_ops[X86_VENDOR_AMD];
size_or_mask = SIZE_OR_MASK_BITS(32);
@@ -715,14 +715,14 @@ void __init mtrr_bp_init(void)
}
break;
case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
+ if (cpu_feature_enabled(X86_FEATURE_CENTAUR_MCR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
}
break;
case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
+ if (cpu_feature_enabled(X86_FEATURE_CYRIX_ARR)) {
mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
size_or_mask = SIZE_OR_MASK_BITS(32);
size_and_mask = 0;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 2879ecdaac43..143e5f5dc855 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -243,7 +243,9 @@ static bool check_hw_exists(void)
msr_fail:
printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
- printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
+ printk("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
+ boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
+ reg, val_new);
return false;
}
@@ -387,7 +389,7 @@ int x86_pmu_hw_config(struct perf_event *event)
precise++;
/* Support for IP fixup */
- if (x86_pmu.lbr_nr)
+ if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
precise++;
}
@@ -487,7 +489,7 @@ static int __x86_pmu_event_init(struct perf_event *event)
void x86_pmu_disable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -505,7 +507,7 @@ void x86_pmu_disable_all(void)
static void x86_pmu_disable(struct pmu *pmu)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu_initialized())
return;
@@ -522,7 +524,7 @@ static void x86_pmu_disable(struct pmu *pmu)
void x86_pmu_enable_all(int added)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -869,7 +871,7 @@ static void x86_pmu_start(struct perf_event *event, int flags);
static void x86_pmu_enable(struct pmu *pmu)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_event *event;
struct hw_perf_event *hwc;
int i, added = cpuc->n_added;
@@ -1020,7 +1022,7 @@ void x86_pmu_enable_event(struct perf_event *event)
*/
static int x86_pmu_add(struct perf_event *event, int flags)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc;
int assign[X86_PMC_IDX_MAX];
int n, n0, ret;
@@ -1071,7 +1073,7 @@ out:
static void x86_pmu_start(struct perf_event *event, int flags)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx = event->hw.idx;
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@@ -1150,7 +1152,7 @@ void perf_event_print_debug(void)
void x86_pmu_stop(struct perf_event *event, int flags)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
@@ -1172,7 +1174,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
static void x86_pmu_del(struct perf_event *event, int flags)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int i;
/*
@@ -1227,7 +1229,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* Some chipsets need to unmask the LVTPC in a particular spot
@@ -1636,7 +1638,7 @@ static void x86_pmu_cancel_txn(struct pmu *pmu)
*/
static int x86_pmu_commit_txn(struct pmu *pmu)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int assign[X86_PMC_IDX_MAX];
int n, ret;
@@ -1995,7 +1997,7 @@ static unsigned long get_segment_base(unsigned int segment)
if (idx > GDT_ENTRIES)
return 0;
- desc = __this_cpu_ptr(&gdt_page.gdt[0]);
+ desc = raw_cpu_ptr(gdt_page.gdt);
}
return get_desc_base(desc + idx);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 8ade93111e03..fc5eb390b368 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -67,8 +67,10 @@ struct event_constraint {
*/
#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
struct amd_nb {
int nb_id; /* NorthBridge id */
@@ -252,18 +254,52 @@ struct cpu_hw_events {
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define INTEL_PLD_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
#define INTEL_PST_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
-/* DataLA version of store sampling without extra enable bit. */
-#define INTEL_PST_HSW_CONSTRAINT(c, n) \
- __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+/* Event constraint, but match on all event flags too. */
+#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+
+/* Check only flags, but allow all event/umask */
+#define INTEL_ALL_EVENT_CONSTRAINT(code, n) \
+ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS)
+
+/* Check flags and event code, and set the HSW store flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+
+/* Check flags and event code, and set the HSW load flag */
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW store flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+/* Check flags and event code/umask, and set the HSW load flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+
+/* Check flags and event code/umask, and set the HSW N/A flag */
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW)
+
+
/*
* We define the end marker as having a weight of -1
* to enable blacklisting of events using a counter bitmask
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index beeb7cc07044..28926311aac1 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -699,7 +699,7 @@ __init int amd_pmu_init(void)
void amd_pmu_enable_virt(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
cpuc->perf_ctr_virt_mask = 0;
@@ -711,7 +711,7 @@ EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
void amd_pmu_disable_virt(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* We only mask out the Host-only bit so that host-only counting works
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2502d0d9d246..944bf019b74f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1045,7 +1045,7 @@ static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
static void intel_pmu_disable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -1058,7 +1058,7 @@ static void intel_pmu_disable_all(void)
static void intel_pmu_enable_all(int added)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
intel_pmu_pebs_enable_all();
intel_pmu_lbr_enable_all();
@@ -1092,7 +1092,7 @@ static void intel_pmu_enable_all(int added)
*/
static void intel_pmu_nhm_workaround(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
static const unsigned long nhm_magic[4] = {
0x4300B5,
0x4300D2,
@@ -1191,7 +1191,7 @@ static inline bool event_is_checkpointed(struct perf_event *event)
static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
intel_pmu_disable_bts();
@@ -1255,7 +1255,7 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
static void intel_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
if (!__this_cpu_read(cpu_hw_events.enabled))
@@ -1349,7 +1349,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
u64 status;
int handled;
- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
/*
* No known reason to not always do late ACK,
@@ -1781,7 +1781,7 @@ EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
@@ -1802,7 +1802,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
int idx;
@@ -1836,7 +1836,7 @@ static void core_pmu_enable_event(struct perf_event *event)
static void core_pmu_enable_all(int added)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -2367,15 +2367,15 @@ __init int intel_pmu_init(void)
* Install the hw-cache-events table:
*/
switch (boot_cpu_data.x86_model) {
- case 14: /* 65 nm core solo/duo, "Yonah" */
+ case 14: /* 65nm Core "Yonah" */
pr_cont("Core events, ");
break;
- case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ case 15: /* 65nm Core2 "Merom" */
x86_add_quirk(intel_clovertown_quirk);
- case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
- case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
- case 29: /* six-core 45 nm xeon "Dunnington" */
+ case 22: /* 65nm Core2 "Merom-L" */
+ case 23: /* 45nm Core2 "Penryn" */
+ case 29: /* 45nm Core2 "Dunnington" (MP) */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2386,9 +2386,9 @@ __init int intel_pmu_init(void)
pr_cont("Core2 events, ");
break;
- case 26: /* 45 nm nehalem, "Bloomfield" */
- case 30: /* 45 nm nehalem, "Lynnfield" */
- case 46: /* 45 nm nehalem-ex, "Beckton" */
+ case 30: /* 45nm Nehalem */
+ case 26: /* 45nm Nehalem-EP */
+ case 46: /* 45nm Nehalem-EX */
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2415,11 +2415,11 @@ __init int intel_pmu_init(void)
pr_cont("Nehalem events, ");
break;
- case 28: /* Atom */
- case 38: /* Lincroft */
- case 39: /* Penwell */
- case 53: /* Cloverview */
- case 54: /* Cedarview */
+ case 28: /* 45nm Atom "Pineview" */
+ case 38: /* 45nm Atom "Lincroft" */
+ case 39: /* 32nm Atom "Penwell" */
+ case 53: /* 32nm Atom "Cloverview" */
+ case 54: /* 32nm Atom "Cedarview" */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2430,8 +2430,8 @@ __init int intel_pmu_init(void)
pr_cont("Atom events, ");
break;
- case 55: /* Atom 22nm "Silvermont" */
- case 77: /* Avoton "Silvermont" */
+ case 55: /* 22nm Atom "Silvermont" */
+ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
@@ -2446,9 +2446,9 @@ __init int intel_pmu_init(void)
pr_cont("Silvermont events, ");
break;
- case 37: /* 32 nm nehalem, "Clarkdale" */
- case 44: /* 32 nm nehalem, "Gulftown" */
- case 47: /* 32 nm Xeon E7 */
+ case 37: /* 32nm Westmere */
+ case 44: /* 32nm Westmere-EP */
+ case 47: /* 32nm Westmere-EX */
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -2474,8 +2474,8 @@ __init int intel_pmu_init(void)
pr_cont("Westmere events, ");
break;
- case 42: /* SandyBridge */
- case 45: /* SandyBridge, "Romely-EP" */
+ case 42: /* 32nm SandyBridge */
+ case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -2506,8 +2506,9 @@ __init int intel_pmu_init(void)
pr_cont("SandyBridge events, ");
break;
- case 58: /* IvyBridge */
- case 62: /* IvyBridge EP */
+
+ case 58: /* 22nm IvyBridge */
+ case 62: /* 22nm IvyBridge-EP/EX */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2539,11 +2540,10 @@ __init int intel_pmu_init(void)
break;
- case 60: /* Haswell Client */
- case 70:
- case 71:
- case 63:
- case 69:
+ case 60: /* 22nm Haswell Core */
+ case 63: /* 22nm Haswell Server */
+ case 69: /* 22nm Haswell ULT */
+ case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -2552,7 +2552,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
- x86_pmu.extra_regs = intel_snb_extra_regs;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
x86_pmu.er_flags |= ERF_HAS_RSP_1;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 696ade311ded..46211bcc813e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -108,14 +108,16 @@ static u64 precise_store_data(u64 status)
return val;
}
-static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
+static u64 precise_datala_hsw(struct perf_event *event, u64 status)
{
union perf_mem_data_src dse;
- u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK;
- dse.val = 0;
- dse.mem_op = PERF_MEM_OP_STORE;
- dse.mem_lvl = PERF_MEM_LVL_NA;
+ dse.val = PERF_MEM_NA;
+
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
+ dse.mem_op = PERF_MEM_OP_STORE;
+ else if (event->hw.flags & PERF_X86_EVENT_PEBS_LD_HSW)
+ dse.mem_op = PERF_MEM_OP_LOAD;
/*
* L1 info only valid for following events:
@@ -125,15 +127,12 @@ static u64 precise_store_data_hsw(struct perf_event *event, u64 status)
* MEM_UOPS_RETIRED.SPLIT_STORES
* MEM_UOPS_RETIRED.ALL_STORES
*/
- if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0)
- return dse.mem_lvl;
-
- if (status & 1)
- dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
- else
- dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
-
- /* Nothing else supported. Sorry. */
+ if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) {
+ if (status & 1)
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ else
+ dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS;
+ }
return dse.val;
}
@@ -475,7 +474,7 @@ void intel_pmu_enable_bts(u64 config)
void intel_pmu_disable_bts(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned long debugctlmsr;
if (!cpuc->ds)
@@ -492,7 +491,7 @@ void intel_pmu_disable_bts(void)
int intel_pmu_drain_bts_buffer(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct bts_record {
u64 from;
@@ -569,28 +568,10 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
};
struct event_constraint intel_slm_pebs_event_constraints[] = {
- INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */
- INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */
- INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */
- INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */
- INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */
- INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */
- INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */
- INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */
- INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */
- INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */
- INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */
- INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
};
@@ -626,68 +607,44 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
- INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
struct event_constraint intel_hsw_pebs_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
- INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
- INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
- INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */
- INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */
- INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */
- INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */
- /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf),
- /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */
- INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */
- /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */
- INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */
- INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */
- INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf),
- /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */
- INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf),
- INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */
- INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */
-
+ INTEL_PLD_CONSTRAINT(0x01cd, 0xf), /* MEM_TRANS_RETIRED.* */
+ /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ /* Allow all events as PEBS with no flags */
+ INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
};
@@ -712,7 +669,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
void intel_pmu_pebs_enable(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
@@ -727,7 +684,7 @@ void intel_pmu_pebs_enable(struct perf_event *event)
void intel_pmu_pebs_disable(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -745,7 +702,7 @@ void intel_pmu_pebs_disable(struct perf_event *event)
void intel_pmu_pebs_enable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->pebs_enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -753,7 +710,7 @@ void intel_pmu_pebs_enable_all(void)
void intel_pmu_pebs_disable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->pebs_enabled)
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
@@ -761,7 +718,7 @@ void intel_pmu_pebs_disable_all(void)
static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned long from = cpuc->lbr_entries[0].from;
unsigned long old_to, to = cpuc->lbr_entries[0].to;
unsigned long ip = regs->ip;
@@ -864,51 +821,53 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
static void __intel_pmu_pebs_event(struct perf_event *event,
struct pt_regs *iregs, void *__pebs)
{
+#define PERF_X86_EVENT_PEBS_HSW_PREC \
+ (PERF_X86_EVENT_PEBS_ST_HSW | \
+ PERF_X86_EVENT_PEBS_LD_HSW | \
+ PERF_X86_EVENT_PEBS_NA_HSW)
/*
* We cast to the biggest pebs_record but are careful not to
* unconditionally access the 'extra' entries.
*/
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct pebs_record_hsw *pebs = __pebs;
struct perf_sample_data data;
struct pt_regs regs;
u64 sample_type;
- int fll, fst;
+ int fll, fst, dsrc;
+ int fl = event->hw.flags;
if (!intel_pmu_save_and_restart(event))
return;
- fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT;
- fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST |
- PERF_X86_EVENT_PEBS_ST_HSW);
+ sample_type = event->attr.sample_type;
+ dsrc = sample_type & PERF_SAMPLE_DATA_SRC;
+
+ fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
+ fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
perf_sample_data_init(&data, 0, event->hw.last_period);
data.period = event->hw.last_period;
- sample_type = event->attr.sample_type;
/*
- * if PEBS-LL or PreciseStore
+ * Use latency for weight (only avail with PEBS-LL)
*/
- if (fll || fst) {
- /*
- * Use latency for weight (only avail with PEBS-LL)
- */
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
- data.weight = pebs->lat;
-
- /*
- * data.data_src encodes the data source
- */
- if (sample_type & PERF_SAMPLE_DATA_SRC) {
- if (fll)
- data.data_src.val = load_latency_data(pebs->dse);
- else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW)
- data.data_src.val =
- precise_store_data_hsw(event, pebs->dse);
- else
- data.data_src.val = precise_store_data(pebs->dse);
- }
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
+ data.weight = pebs->lat;
+
+ /*
+ * data.data_src encodes the data source
+ */
+ if (dsrc) {
+ u64 val = PERF_MEM_NA;
+ if (fll)
+ val = load_latency_data(pebs->dse);
+ else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
+ val = precise_datala_hsw(event, pebs->dse);
+ else if (fst)
+ val = precise_store_data(pebs->dse);
+ data.data_src.val = val;
}
/*
@@ -935,16 +894,16 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
else
regs.flags &= ~PERF_EFLAGS_EXACT;
- if ((event->attr.sample_type & PERF_SAMPLE_ADDR) &&
+ if ((sample_type & PERF_SAMPLE_ADDR) &&
x86_pmu.intel_cap.pebs_format >= 1)
data.addr = pebs->dla;
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll)
+ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
data.weight = intel_hsw_weight(pebs);
- if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION)
+ if (sample_type & PERF_SAMPLE_TRANSACTION)
data.txn = intel_hsw_transaction(pebs);
}
@@ -957,7 +916,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct perf_event *event = cpuc->events[0]; /* PMC0 only */
struct pebs_record_core *at, *top;
@@ -998,7 +957,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct debug_store *ds = cpuc->ds;
struct perf_event *event = NULL;
void *at, *top;
@@ -1055,7 +1014,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
* BTS, PEBS probe and setup
*/
-void intel_ds_init(void)
+void __init intel_ds_init(void)
{
/*
* No support for 32bit formats
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 9dd2459a4c73..45fa730a5283 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -133,7 +133,7 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
static void __intel_pmu_lbr_enable(void)
{
u64 debugctl;
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_sel)
wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
@@ -183,7 +183,7 @@ void intel_pmu_lbr_reset(void)
void intel_pmu_lbr_enable(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
@@ -203,7 +203,7 @@ void intel_pmu_lbr_enable(struct perf_event *event)
void intel_pmu_lbr_disable(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!x86_pmu.lbr_nr)
return;
@@ -220,7 +220,7 @@ void intel_pmu_lbr_disable(struct perf_event *event)
void intel_pmu_lbr_enable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_enable();
@@ -228,7 +228,7 @@ void intel_pmu_lbr_enable_all(void)
void intel_pmu_lbr_disable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
__intel_pmu_lbr_disable();
@@ -332,7 +332,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
void intel_pmu_lbr_read(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (!cpuc->lbr_users)
return;
@@ -697,7 +697,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
};
/* core */
-void intel_pmu_lbr_init_core(void)
+void __init intel_pmu_lbr_init_core(void)
{
x86_pmu.lbr_nr = 4;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -712,7 +712,7 @@ void intel_pmu_lbr_init_core(void)
}
/* nehalem/westmere */
-void intel_pmu_lbr_init_nhm(void)
+void __init intel_pmu_lbr_init_nhm(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -733,7 +733,7 @@ void intel_pmu_lbr_init_nhm(void)
}
/* sandy bridge */
-void intel_pmu_lbr_init_snb(void)
+void __init intel_pmu_lbr_init_snb(void)
{
x86_pmu.lbr_nr = 16;
x86_pmu.lbr_tos = MSR_LBR_TOS;
@@ -753,7 +753,7 @@ void intel_pmu_lbr_init_snb(void)
}
/* atom */
-void intel_pmu_lbr_init_atom(void)
+void __init intel_pmu_lbr_init_atom(void)
{
/*
* only models starting at stepping 10 seems
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 619f7699487a..d64f275fe274 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -135,7 +135,7 @@ static inline u64 rapl_scale(u64 v)
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
+ return v << (32 - __this_cpu_read(rapl_pmu->hw_unit));
}
static u64 rapl_event_update(struct perf_event *event)
@@ -187,7 +187,7 @@ static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
- struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct perf_event *event;
unsigned long flags;
@@ -234,7 +234,7 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
unsigned long flags;
spin_lock_irqsave(&pmu->lock, flags);
@@ -244,7 +244,7 @@ static void rapl_pmu_event_start(struct perf_event *event, int mode)
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
@@ -278,7 +278,7 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
- struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
+ struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
struct hw_perf_event *hwc = &event->hw;
unsigned long flags;
@@ -696,7 +696,7 @@ static int __init rapl_pmu_init(void)
return -1;
}
- pmu = __get_cpu_var(rapl_pmu);
+ pmu = __this_cpu_read(rapl_pmu);
pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
" API unit is 2^-32 Joules,"
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index cfc6f9dfcd90..9762dbd9f3f7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -1,83 +1,39 @@
#include "perf_event_intel_uncore.h"
static struct intel_uncore_type *empty_uncore[] = { NULL, };
-static struct intel_uncore_type **msr_uncores = empty_uncore;
-static struct intel_uncore_type **pci_uncores = empty_uncore;
-/* pci bus to socket mapping */
-static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
+struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
-static struct pci_dev *extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+static bool pcidrv_registered;
+struct pci_driver *uncore_pci_driver;
+/* pci bus to socket mapping */
+int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
static DEFINE_RAW_SPINLOCK(uncore_box_lock);
-
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;
/* constraint for the fixed counter */
-static struct event_constraint constraint_fixed =
+static struct event_constraint uncore_constraint_fixed =
EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
-static struct event_constraint constraint_empty =
+struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
-#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
- ((1ULL << (n)) - 1)))
-
-DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
-DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
-DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
-DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
-DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
-DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
-DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
-DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
-DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
-DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
-DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
-DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
-DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
-DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
-DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
-DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
-DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
-DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
-DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
-DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
-
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
-static void uncore_pmu_event_read(struct perf_event *event);
-
-static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
+ssize_t uncore_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct uncore_event_desc *event =
+ container_of(attr, struct uncore_event_desc, attr);
+ return sprintf(buf, "%s", event->config);
+}
+
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
{
return container_of(event->pmu, struct intel_uncore_pmu, pmu);
}
-static struct intel_uncore_box *
-uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
struct intel_uncore_box *box;
@@ -86,6 +42,9 @@ uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
return box;
raw_spin_lock(&uncore_box_lock);
+ /* Recheck in lock to handle races. */
+ if (*per_cpu_ptr(pmu->box, cpu))
+ goto out;
list_for_each_entry(box, &pmu->box_list, list) {
if (box->phys_id == topology_physical_package_id(cpu)) {
atomic_inc(&box->refcnt);
@@ -93,12 +52,13 @@ uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
break;
}
}
+out:
raw_spin_unlock(&uncore_box_lock);
return *per_cpu_ptr(pmu->box, cpu);
}
-static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
{
/*
* perf core schedules event on the basis of cpu, uncore events are
@@ -107,7 +67,7 @@ static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
}
-static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
u64 count;
@@ -119,7 +79,7 @@ static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_eve
/*
* generic get constraint function for shared match/mask registers.
*/
-static struct event_constraint *
+struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
struct intel_uncore_extra_reg *er;
@@ -154,10 +114,10 @@ uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
return NULL;
}
- return &constraint_empty;
+ return &uncore_constraint_empty;
}
-static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
struct intel_uncore_extra_reg *er;
struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
@@ -178,7 +138,7 @@ static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_even
reg1->alloc = 0;
}
-static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
{
struct intel_uncore_extra_reg *er;
unsigned long flags;
@@ -193,2936 +153,6 @@ static u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
return config;
}
-/* Sandy Bridge-EP uncore support */
-static struct intel_uncore_type snbep_uncore_cbox;
-static struct intel_uncore_type snbep_uncore_pcu;
-
-static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int box_ctl = uncore_pci_box_ctl(box);
- u32 config = 0;
-
- if (!pci_read_config_dword(pdev, box_ctl, &config)) {
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- pci_write_config_dword(pdev, box_ctl, config);
- }
-}
-
-static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config);
-}
-
-static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- u64 count = 0;
-
- pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
- pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
-
- return count;
-}
-
-static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config |= SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- u64 config;
- unsigned msr;
-
- msr = uncore_msr_box_ctl(box);
- if (msr) {
- rdmsrl(msr, config);
- config &= ~SNBEP_PMON_BOX_CTL_FRZ;
- wrmsrl(msr, config);
- }
-}
-
-static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE)
- wrmsrl(reg1->reg, uncore_shared_reg_config(box, 0));
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
- struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- wrmsrl(hwc->config_base, hwc->config);
-}
-
-static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
-
- if (msr)
- wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
-}
-
-static struct attribute *snbep_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_nid.attr,
- &format_attr_filter_state.attr,
- &format_attr_filter_opc.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *snbep_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_match_rds.attr,
- &format_attr_match_rnid30.attr,
- &format_attr_match_rnid4.attr,
- &format_attr_match_dnid.attr,
- &format_attr_match_mc.attr,
- &format_attr_match_opc.attr,
- &format_attr_match_vnw.attr,
- &format_attr_match0.attr,
- &format_attr_match1.attr,
- &format_attr_mask_rds.attr,
- &format_attr_mask_rnid30.attr,
- &format_attr_mask_rnid4.attr,
- &format_attr_mask_dnid.attr,
- &format_attr_mask_mc.attr,
- &format_attr_mask_opc.attr,
- &format_attr_mask_vnw.attr,
- &format_attr_mask0.attr,
- &format_attr_mask1.attr,
- NULL,
-};
-
-static struct uncore_event_desc snbep_uncore_imc_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
- INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc snbep_uncore_qpi_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
- INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
- INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute_group snbep_uncore_format_group = {
- .name = "format",
- .attrs = snbep_uncore_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_ubox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_cbox_format_group = {
- .name = "format",
- .attrs = snbep_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_pcu_format_group = {
- .name = "format",
- .attrs = snbep_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group snbep_uncore_qpi_format_group = {
- .name = "format",
- .attrs = snbep_uncore_qpi_formats_attr,
-};
-
-#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops snbep_uncore_msr_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
- .init_box = snbep_uncore_pci_init_box, \
- .disable_box = snbep_uncore_pci_disable_box, \
- .enable_box = snbep_uncore_pci_enable_box, \
- .disable_event = snbep_uncore_pci_disable_event, \
- .read_counter = snbep_uncore_pci_read_counter
-
-static struct intel_uncore_ops snbep_uncore_pci_ops = {
- SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
- .enable_event = snbep_uncore_pci_enable_event, \
-};
-
-static struct event_constraint snbep_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
- UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
- EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
- UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snbep_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &snbep_uncore_msr_ops,
- .format_group = &snbep_uncore_ubox_format_group,
-};
-
-static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
- EVENT_EXTRA_END
-};
-
-static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i;
-
- if (uncore_box_is_fake(box))
- return;
-
- for (i = 0; i < 5; i++) {
- if (reg1->alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- reg1->alloc = 0;
-}
-
-static struct event_constraint *
-__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
- u64 (*cbox_filter_mask)(int fields))
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- int i, alloc = 0;
- unsigned long flags;
- u64 mask;
-
- if (reg1->idx == EXTRA_REG_NONE)
- return NULL;
-
- raw_spin_lock_irqsave(&er->lock, flags);
- for (i = 0; i < 5; i++) {
- if (!(reg1->idx & (0x1 << i)))
- continue;
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- continue;
-
- mask = cbox_filter_mask(0x1 << i);
- if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
- !((reg1->config ^ er->config) & mask)) {
- atomic_add(1 << (i * 6), &er->ref);
- er->config &= ~mask;
- er->config |= reg1->config & mask;
- alloc |= (0x1 << i);
- } else {
- break;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
- if (i < 5)
- goto fail;
-
- if (!uncore_box_is_fake(box))
- reg1->alloc |= alloc;
-
- return NULL;
-fail:
- for (; i >= 0; i--) {
- if (alloc & (0x1 << i))
- atomic_sub(1 << (i * 6), &er->ref);
- }
- return &constraint_empty;
-}
-
-static u64 snbep_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x4)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
-}
-
-static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_cbox_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_cbox_hw_config,
- .get_constraint = snbep_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &snbep_uncore_cbox_ops,
- .format_group = &snbep_uncore_cbox_format_group,
-};
-
-static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 config = reg1->config;
-
- if (new_idx > reg1->idx)
- config <<= 8 * (new_idx - reg1->idx);
- else
- config >>= 8 * (reg1->idx - new_idx);
-
- if (modify) {
- hwc->config += new_idx - reg1->idx;
- reg1->config = config;
- reg1->idx = new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
- unsigned long flags;
- int idx = reg1->idx;
- u64 mask, config1 = reg1->config;
- bool ok = false;
-
- if (reg1->idx == EXTRA_REG_NONE ||
- (!uncore_box_is_fake(box) && reg1->alloc))
- return NULL;
-again:
- mask = 0xffULL << (idx * 8);
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
- !((config1 ^ er->config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- idx = (idx + 1) % 4;
- if (idx != reg1->idx) {
- config1 = snbep_pcu_alter_er(event, idx, false);
- goto again;
- }
- return &constraint_empty;
- }
-
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx)
- snbep_pcu_alter_er(event, idx, true);
- reg1->alloc = 1;
- }
- return NULL;
-}
-
-static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct intel_uncore_extra_reg *er = &box->shared_regs[0];
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- atomic_sub(1 << (reg1->idx * 8), &er->ref);
- reg1->alloc = 0;
-}
-
-static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
-
- if (ev_sel >= 0xb && ev_sel <= 0xe) {
- reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
- reg1->idx = ev_sel - 0xb;
- reg1->config = event->attr.config1 & (0xff << reg1->idx);
- }
- return 0;
-}
-
-static struct intel_uncore_ops snbep_uncore_pcu_ops = {
- SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type snbep_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &snbep_uncore_pcu_ops,
- .format_group = &snbep_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *snbep_msr_uncores[] = {
- &snbep_uncore_ubox,
- &snbep_uncore_cbox,
- &snbep_uncore_pcu,
- NULL,
-};
-
-enum {
- SNBEP_PCI_QPI_PORT0_FILTER,
- SNBEP_PCI_QPI_PORT1_FILTER,
-};
-
-static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) == 0x38) {
- reg1->idx = 0;
- reg1->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
- reg1->config = event->attr.config1;
- reg2->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
- reg2->config = event->attr.config2;
- }
- return 0;
-}
-
-static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
- struct pci_dev *filter_pdev = extra_pci_dev[box->phys_id][idx];
- WARN_ON_ONCE(!filter_pdev);
- if (filter_pdev) {
- pci_write_config_dword(filter_pdev, reg1->reg,
- (u32)reg1->config);
- pci_write_config_dword(filter_pdev, reg1->reg + 4,
- (u32)(reg1->config >> 32));
- pci_write_config_dword(filter_pdev, reg2->reg,
- (u32)reg2->config);
- pci_write_config_dword(filter_pdev, reg2->reg + 4,
- (u32)(reg2->config >> 32));
- }
- }
-
- pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops snbep_uncore_qpi_ops = {
- SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
- .enable_event = snbep_qpi_enable_event,
- .hw_config = snbep_qpi_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-#define SNBEP_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &snbep_uncore_pci_ops, \
- .format_group = &snbep_uncore_format_group
-
-static struct intel_uncore_type snbep_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 4,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- .event_descs = snbep_uncore_imc_events,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &snbep_uncore_qpi_ops,
- .event_descs = snbep_uncore_qpi_events,
- .format_group = &snbep_uncore_qpi_format_group,
-};
-
-
-static struct intel_uncore_type snbep_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type snbep_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- SNBEP_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- SNBEP_PCI_UNCORE_HA,
- SNBEP_PCI_UNCORE_IMC,
- SNBEP_PCI_UNCORE_QPI,
- SNBEP_PCI_UNCORE_R2PCIE,
- SNBEP_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *snbep_pci_uncores[] = {
- [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
- [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
- [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
- [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
- [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
- NULL,
-};
-
-static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
- { /* Home Agent */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
- },
- { /* MC Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
- },
- { /* MC Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
- },
- { /* MC Channel 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
- },
- { /* MC Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
- },
- { /* QPI Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
- },
- { /* QPI Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
- },
- { /* R3QPI Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
- },
- { /* R3QPI Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
- .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT0_FILTER),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT1_FILTER),
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver snbep_uncore_pci_driver = {
- .name = "snbep_uncore",
- .id_table = snbep_uncore_pci_ids,
-};
-
-/*
- * build pci bus to socket mapping
- */
-static int snbep_pci2phy_map_init(int devid)
-{
- struct pci_dev *ubox_dev = NULL;
- int i, bus, nodeid;
- int err = 0;
- u32 config = 0;
-
- while (1) {
- /* find the UBOX device */
- ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
- if (!ubox_dev)
- break;
- bus = ubox_dev->bus->number;
- /* get the Node ID of the local register */
- err = pci_read_config_dword(ubox_dev, 0x40, &config);
- if (err)
- break;
- nodeid = config;
- /* get the Node ID mapping */
- err = pci_read_config_dword(ubox_dev, 0x54, &config);
- if (err)
- break;
- /*
- * every three bits in the Node ID mapping register maps
- * to a particular node.
- */
- for (i = 0; i < 8; i++) {
- if (nodeid == ((config >> (3 * i)) & 0x7)) {
- pcibus_to_physid[bus] = i;
- break;
- }
- }
- }
-
- if (!err) {
- /*
- * For PCI bus with no UBOX device, find the next bus
- * that has UBOX device and use its mapping.
- */
- i = -1;
- for (bus = 255; bus >= 0; bus--) {
- if (pcibus_to_physid[bus] >= 0)
- i = pcibus_to_physid[bus];
- else
- pcibus_to_physid[bus] = i;
- }
- }
-
- if (ubox_dev)
- pci_dev_put(ubox_dev);
-
- return err ? pcibios_err_to_errno(err) : 0;
-}
-/* end of Sandy Bridge-EP uncore support */
-
-/* IvyTown uncore support */
-static void ivt_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- if (msr)
- wrmsrl(msr, IVT_PMON_BOX_CTL_INT);
-}
-
-static void ivt_uncore_pci_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
-
- pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, IVT_PMON_BOX_CTL_INT);
-}
-
-#define IVT_UNCORE_MSR_OPS_COMMON_INIT() \
- .init_box = ivt_uncore_msr_init_box, \
- .disable_box = snbep_uncore_msr_disable_box, \
- .enable_box = snbep_uncore_msr_enable_box, \
- .disable_event = snbep_uncore_msr_disable_event, \
- .enable_event = snbep_uncore_msr_enable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops ivt_uncore_msr_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
-};
-
-static struct intel_uncore_ops ivt_uncore_pci_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_uncore_pci_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
-};
-
-#define IVT_UNCORE_PCI_COMMON_INIT() \
- .perf_ctr = SNBEP_PCI_PMON_CTR0, \
- .event_ctl = SNBEP_PCI_PMON_CTL0, \
- .event_mask = IVT_PMON_RAW_EVENT_MASK, \
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
- .ops = &ivt_uncore_pci_ops, \
- .format_group = &ivt_uncore_format_group
-
-static struct attribute *ivt_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh5.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_tid_en.attr,
- &format_attr_thresh8.attr,
- &format_attr_filter_tid.attr,
- &format_attr_filter_link.attr,
- &format_attr_filter_state2.attr,
- &format_attr_filter_nid2.attr,
- &format_attr_filter_opc2.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_occ_sel.attr,
- &format_attr_edge.attr,
- &format_attr_thresh5.attr,
- &format_attr_occ_invert.attr,
- &format_attr_occ_edge.attr,
- &format_attr_filter_band0.attr,
- &format_attr_filter_band1.attr,
- &format_attr_filter_band2.attr,
- &format_attr_filter_band3.attr,
- NULL,
-};
-
-static struct attribute *ivt_uncore_qpi_formats_attr[] = {
- &format_attr_event_ext.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_thresh8.attr,
- &format_attr_match_rds.attr,
- &format_attr_match_rnid30.attr,
- &format_attr_match_rnid4.attr,
- &format_attr_match_dnid.attr,
- &format_attr_match_mc.attr,
- &format_attr_match_opc.attr,
- &format_attr_match_vnw.attr,
- &format_attr_match0.attr,
- &format_attr_match1.attr,
- &format_attr_mask_rds.attr,
- &format_attr_mask_rnid30.attr,
- &format_attr_mask_rnid4.attr,
- &format_attr_mask_dnid.attr,
- &format_attr_mask_mc.attr,
- &format_attr_mask_opc.attr,
- &format_attr_mask_vnw.attr,
- &format_attr_mask0.attr,
- &format_attr_mask1.attr,
- NULL,
-};
-
-static struct attribute_group ivt_uncore_format_group = {
- .name = "format",
- .attrs = ivt_uncore_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_ubox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_ubox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_cbox_format_group = {
- .name = "format",
- .attrs = ivt_uncore_cbox_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_pcu_format_group = {
- .name = "format",
- .attrs = ivt_uncore_pcu_formats_attr,
-};
-
-static struct attribute_group ivt_uncore_qpi_format_group = {
- .name = "format",
- .attrs = ivt_uncore_qpi_formats_attr,
-};
-
-static struct intel_uncore_type ivt_uncore_ubox = {
- .name = "ubox",
- .num_counters = 2,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
- .event_ctl = SNBEP_U_MSR_PMON_CTL0,
- .event_mask = IVT_U_MSR_PMON_RAW_EVENT_MASK,
- .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
- .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
- .ops = &ivt_uncore_msr_ops,
- .format_group = &ivt_uncore_ubox_format_group,
-};
-
-static struct extra_reg ivt_uncore_cbox_extra_regs[] = {
- SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
- SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
- SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
-
- SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
- SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
- EVENT_EXTRA_END
-};
-
-static u64 ivt_cbox_filter_mask(int fields)
-{
- u64 mask = 0;
-
- if (fields & 0x1)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_TID;
- if (fields & 0x2)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_LINK;
- if (fields & 0x4)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_STATE;
- if (fields & 0x8)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_NID;
- if (fields & 0x10)
- mask |= IVT_CB0_MSR_PMON_BOX_FILTER_OPC;
-
- return mask;
-}
-
-static struct event_constraint *
-ivt_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- return __snbep_cbox_get_constraint(box, event, ivt_cbox_filter_mask);
-}
-
-static int ivt_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct extra_reg *er;
- int idx = 0;
-
- for (er = ivt_uncore_cbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- idx |= er->idx;
- }
-
- if (idx) {
- reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
- SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
- reg1->config = event->attr.config1 & ivt_cbox_filter_mask(idx);
- reg1->idx = idx;
- }
- return 0;
-}
-
-static void ivt_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- u64 filter = uncore_shared_reg_config(box, 0);
- wrmsrl(reg1->reg, filter & 0xffffffff);
- wrmsrl(reg1->reg + 6, filter >> 32);
- }
-
- wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static struct intel_uncore_ops ivt_uncore_cbox_ops = {
- .init_box = ivt_uncore_msr_init_box,
- .disable_box = snbep_uncore_msr_disable_box,
- .enable_box = snbep_uncore_msr_enable_box,
- .disable_event = snbep_uncore_msr_disable_event,
- .enable_event = ivt_cbox_enable_event,
- .read_counter = uncore_msr_read_counter,
- .hw_config = ivt_cbox_hw_config,
- .get_constraint = ivt_cbox_get_constraint,
- .put_constraint = snbep_cbox_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_cbox = {
- .name = "cbox",
- .num_counters = 4,
- .num_boxes = 15,
- .perf_ctr_bits = 44,
- .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
- .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
- .event_mask = IVT_CBO_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
- .msr_offset = SNBEP_CBO_MSR_OFFSET,
- .num_shared_regs = 1,
- .constraints = snbep_uncore_cbox_constraints,
- .ops = &ivt_uncore_cbox_ops,
- .format_group = &ivt_uncore_cbox_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_pcu_ops = {
- IVT_UNCORE_MSR_OPS_COMMON_INIT(),
- .hw_config = snbep_pcu_hw_config,
- .get_constraint = snbep_pcu_get_constraint,
- .put_constraint = snbep_pcu_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_pcu = {
- .name = "pcu",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
- .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
- .event_mask = IVT_PCU_MSR_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &ivt_uncore_pcu_ops,
- .format_group = &ivt_uncore_pcu_format_group,
-};
-
-static struct intel_uncore_type *ivt_msr_uncores[] = {
- &ivt_uncore_ubox,
- &ivt_uncore_cbox,
- &ivt_uncore_pcu,
- NULL,
-};
-
-static struct intel_uncore_type ivt_uncore_ha = {
- .name = "ha",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_imc = {
- .name = "imc",
- .num_counters = 4,
- .num_boxes = 8,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
- .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-/* registers in IRP boxes are not properly aligned */
-static unsigned ivt_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
-static unsigned ivt_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
-
-static void ivt_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx],
- hwc->config | SNBEP_PMON_CTL_EN);
-}
-
-static void ivt_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
-
- pci_write_config_dword(pdev, ivt_uncore_irp_ctls[hwc->idx], hwc->config);
-}
-
-static u64 ivt_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct pci_dev *pdev = box->pci_dev;
- struct hw_perf_event *hwc = &event->hw;
- u64 count = 0;
-
- pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
- pci_read_config_dword(pdev, ivt_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
-
- return count;
-}
-
-static struct intel_uncore_ops ivt_uncore_irp_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = ivt_uncore_irp_disable_event,
- .enable_event = ivt_uncore_irp_enable_event,
- .read_counter = ivt_uncore_irp_read_counter,
-};
-
-static struct intel_uncore_type ivt_uncore_irp = {
- .name = "irp",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_mask = IVT_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .ops = &ivt_uncore_irp_ops,
- .format_group = &ivt_uncore_format_group,
-};
-
-static struct intel_uncore_ops ivt_uncore_qpi_ops = {
- .init_box = ivt_uncore_pci_init_box,
- .disable_box = snbep_uncore_pci_disable_box,
- .enable_box = snbep_uncore_pci_enable_box,
- .disable_event = snbep_uncore_pci_disable_event,
- .enable_event = snbep_qpi_enable_event,
- .read_counter = snbep_uncore_pci_read_counter,
- .hw_config = snbep_qpi_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type ivt_uncore_qpi = {
- .name = "qpi",
- .num_counters = 4,
- .num_boxes = 3,
- .perf_ctr_bits = 48,
- .perf_ctr = SNBEP_PCI_PMON_CTR0,
- .event_ctl = SNBEP_PCI_PMON_CTL0,
- .event_mask = IVT_QPI_PCI_PMON_RAW_EVENT_MASK,
- .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
- .num_shared_regs = 1,
- .ops = &ivt_uncore_qpi_ops,
- .format_group = &ivt_uncore_qpi_format_group,
-};
-
-static struct intel_uncore_type ivt_uncore_r2pcie = {
- .name = "r2pcie",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r2pcie_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-static struct intel_uncore_type ivt_uncore_r3qpi = {
- .name = "r3qpi",
- .num_counters = 3,
- .num_boxes = 2,
- .perf_ctr_bits = 44,
- .constraints = snbep_uncore_r3qpi_constraints,
- IVT_UNCORE_PCI_COMMON_INIT(),
-};
-
-enum {
- IVT_PCI_UNCORE_HA,
- IVT_PCI_UNCORE_IMC,
- IVT_PCI_UNCORE_IRP,
- IVT_PCI_UNCORE_QPI,
- IVT_PCI_UNCORE_R2PCIE,
- IVT_PCI_UNCORE_R3QPI,
-};
-
-static struct intel_uncore_type *ivt_pci_uncores[] = {
- [IVT_PCI_UNCORE_HA] = &ivt_uncore_ha,
- [IVT_PCI_UNCORE_IMC] = &ivt_uncore_imc,
- [IVT_PCI_UNCORE_IRP] = &ivt_uncore_irp,
- [IVT_PCI_UNCORE_QPI] = &ivt_uncore_qpi,
- [IVT_PCI_UNCORE_R2PCIE] = &ivt_uncore_r2pcie,
- [IVT_PCI_UNCORE_R3QPI] = &ivt_uncore_r3qpi,
- NULL,
-};
-
-static DEFINE_PCI_DEVICE_TABLE(ivt_uncore_pci_ids) = {
- { /* Home Agent 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 0),
- },
- { /* Home Agent 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_HA, 1),
- },
- { /* MC0 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 0),
- },
- { /* MC0 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 1),
- },
- { /* MC0 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 2),
- },
- { /* MC0 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 3),
- },
- { /* MC1 Channel 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 4),
- },
- { /* MC1 Channel 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 5),
- },
- { /* MC1 Channel 3 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 6),
- },
- { /* MC1 Channel 4 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IMC, 7),
- },
- { /* IRP */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_IRP, 0),
- },
- { /* QPI0 Port 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 0),
- },
- { /* QPI0 Port 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 1),
- },
- { /* QPI1 Port 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_QPI, 2),
- },
- { /* R2PCIe */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R2PCIE, 0),
- },
- { /* R3QPI0 Link 0 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 0),
- },
- { /* R3QPI0 Link 1 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 1),
- },
- { /* R3QPI1 Link 2 */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
- .driver_data = UNCORE_PCI_DEV_DATA(IVT_PCI_UNCORE_R3QPI, 2),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT0_FILTER),
- },
- { /* QPI Port 0 filter */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- SNBEP_PCI_QPI_PORT1_FILTER),
- },
- { /* end: all zeroes */ }
-};
-
-static struct pci_driver ivt_uncore_pci_driver = {
- .name = "ivt_uncore",
- .id_table = ivt_uncore_pci_ids,
-};
-/* end of IvyTown uncore support */
-
-/* Sandy Bridge uncore support */
-static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
-}
-
-static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- if (box->pmu->pmu_idx == 0) {
- wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
- SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
- }
-}
-
-static struct uncore_event_desc snb_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- { /* end: all zeroes */ },
-};
-
-static struct attribute *snb_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask5.attr,
- NULL,
-};
-
-static struct attribute_group snb_uncore_format_group = {
- .name = "format",
- .attrs = snb_uncore_formats_attr,
-};
-
-static struct intel_uncore_ops snb_uncore_msr_ops = {
- .init_box = snb_uncore_msr_init_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = snb_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct event_constraint snb_uncore_cbox_constraints[] = {
- UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
- UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
- EVENT_CONSTRAINT_END
-};
-
-static struct intel_uncore_type snb_uncore_cbox = {
- .name = "cbox",
- .num_counters = 2,
- .num_boxes = 4,
- .perf_ctr_bits = 44,
- .fixed_ctr_bits = 48,
- .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
- .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
- .fixed_ctr = SNB_UNC_FIXED_CTR,
- .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
- .single_fixed = 1,
- .event_mask = SNB_UNC_RAW_EVENT_MASK,
- .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
- .constraints = snb_uncore_cbox_constraints,
- .ops = &snb_uncore_msr_ops,
- .format_group = &snb_uncore_format_group,
- .event_descs = snb_uncore_events,
-};
-
-static struct intel_uncore_type *snb_msr_uncores[] = {
- &snb_uncore_cbox,
- NULL,
-};
-
-enum {
- SNB_PCI_UNCORE_IMC,
-};
-
-static struct uncore_event_desc snb_uncore_imc_events[] = {
- INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
- INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
- INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
-
- INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
- INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
- INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
-
- { /* end: all zeroes */ },
-};
-
-#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
-#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
-
-/* page size multiple covering all config regs */
-#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
-
-#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
-#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
-#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
-#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
-
-static struct attribute *snb_uncore_imc_formats_attr[] = {
- &format_attr_event.attr,
- NULL,
-};
-
-static struct attribute_group snb_uncore_imc_format_group = {
- .name = "format",
- .attrs = snb_uncore_imc_formats_attr,
-};
-
-static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
-{
- struct pci_dev *pdev = box->pci_dev;
- int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
- resource_size_t addr;
- u32 pci_dword;
-
- pci_read_config_dword(pdev, where, &pci_dword);
- addr = pci_dword;
-
-#ifdef CONFIG_PHYS_ADDR_T_64BIT
- pci_read_config_dword(pdev, where + 4, &pci_dword);
- addr |= ((resource_size_t)pci_dword << 32);
-#endif
-
- addr &= ~(PAGE_SIZE - 1);
-
- box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
- box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
-}
-
-static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
-{}
-
-static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{}
-
-static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
-}
-
-/*
- * custom event_init() function because we define our own fixed, free
- * running counters, so we do not want to conflict with generic uncore
- * logic. Also simplifies processing
- */
-static int snb_uncore_imc_event_init(struct perf_event *event)
-{
- struct intel_uncore_pmu *pmu;
- struct intel_uncore_box *box;
- struct hw_perf_event *hwc = &event->hw;
- u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
- int idx, base;
-
- if (event->attr.type != event->pmu->type)
- return -ENOENT;
-
- pmu = uncore_event_to_pmu(event);
- /* no device found for this pmu */
- if (pmu->func_id < 0)
- return -ENOENT;
-
- /* Sampling not supported yet */
- if (hwc->sample_period)
- return -EINVAL;
-
- /* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
- return -EINVAL;
-
- /*
- * Place all uncore events for a particular physical package
- * onto a single cpu
- */
- if (event->cpu < 0)
- return -EINVAL;
-
- /* check only supported bits are set */
- if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
- return -EINVAL;
-
- box = uncore_pmu_to_box(pmu, event->cpu);
- if (!box || box->cpu < 0)
- return -EINVAL;
-
- event->cpu = box->cpu;
-
- event->hw.idx = -1;
- event->hw.last_tag = ~0ULL;
- event->hw.extra_reg.idx = EXTRA_REG_NONE;
- event->hw.branch_reg.idx = EXTRA_REG_NONE;
- /*
- * check event is known (whitelist, determines counter)
- */
- switch (cfg) {
- case SNB_UNCORE_PCI_IMC_DATA_READS:
- base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
- idx = UNCORE_PMC_IDX_FIXED;
- break;
- case SNB_UNCORE_PCI_IMC_DATA_WRITES:
- base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
- idx = UNCORE_PMC_IDX_FIXED + 1;
- break;
- default:
- return -EINVAL;
- }
-
- /* must be done before validate_group */
- event->hw.event_base = base;
- event->hw.config = cfg;
- event->hw.idx = idx;
-
- /* no group validation needed, we have free running counters */
-
- return 0;
-}
-
-static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- return 0;
-}
-
-static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- u64 count;
-
- if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
- return;
-
- event->hw.state = 0;
- box->n_active++;
-
- list_add_tail(&event->active_entry, &box->active_list);
-
- count = snb_uncore_imc_read_counter(box, event);
- local64_set(&event->hw.prev_count, count);
-
- if (box->n_active == 1)
- uncore_pmu_start_hrtimer(box);
-}
-
-static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!(hwc->state & PERF_HES_STOPPED)) {
- box->n_active--;
-
- WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
- hwc->state |= PERF_HES_STOPPED;
-
- list_del(&event->active_entry);
-
- if (box->n_active == 0)
- uncore_pmu_cancel_hrtimer(box);
- }
-
- if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
- /*
- * Drain the remaining delta count out of a event
- * that we are disabling:
- */
- uncore_perf_event_update(box, event);
- hwc->state |= PERF_HES_UPTODATE;
- }
-}
-
-static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- struct hw_perf_event *hwc = &event->hw;
-
- if (!box)
- return -ENODEV;
-
- hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
- if (!(flags & PERF_EF_START))
- hwc->state |= PERF_HES_ARCH;
-
- snb_uncore_imc_event_start(event, 0);
-
- box->n_events++;
-
- return 0;
-}
-
-static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
-{
- struct intel_uncore_box *box = uncore_event_to_box(event);
- int i;
-
- snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
-
- for (i = 0; i < box->n_events; i++) {
- if (event == box->event_list[i]) {
- --box->n_events;
- break;
- }
- }
-}
-
-static int snb_pci2phy_map_init(int devid)
-{
- struct pci_dev *dev = NULL;
- int bus;
-
- dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, dev);
- if (!dev)
- return -ENOTTY;
-
- bus = dev->bus->number;
-
- pcibus_to_physid[bus] = 0;
-
- pci_dev_put(dev);
-
- return 0;
-}
-
-static struct pmu snb_uncore_imc_pmu = {
- .task_ctx_nr = perf_invalid_context,
- .event_init = snb_uncore_imc_event_init,
- .add = snb_uncore_imc_event_add,
- .del = snb_uncore_imc_event_del,
- .start = snb_uncore_imc_event_start,
- .stop = snb_uncore_imc_event_stop,
- .read = uncore_pmu_event_read,
-};
-
-static struct intel_uncore_ops snb_uncore_imc_ops = {
- .init_box = snb_uncore_imc_init_box,
- .enable_box = snb_uncore_imc_enable_box,
- .disable_box = snb_uncore_imc_disable_box,
- .disable_event = snb_uncore_imc_disable_event,
- .enable_event = snb_uncore_imc_enable_event,
- .hw_config = snb_uncore_imc_hw_config,
- .read_counter = snb_uncore_imc_read_counter,
-};
-
-static struct intel_uncore_type snb_uncore_imc = {
- .name = "imc",
- .num_counters = 2,
- .num_boxes = 1,
- .fixed_ctr_bits = 32,
- .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
- .event_descs = snb_uncore_imc_events,
- .format_group = &snb_uncore_imc_format_group,
- .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
- .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
- .ops = &snb_uncore_imc_ops,
- .pmu = &snb_uncore_imc_pmu,
-};
-
-static struct intel_uncore_type *snb_pci_uncores[] = {
- [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
- NULL,
-};
-
-static DEFINE_PCI_DEVICE_TABLE(snb_uncore_pci_ids) = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static DEFINE_PCI_DEVICE_TABLE(ivb_uncore_pci_ids) = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static DEFINE_PCI_DEVICE_TABLE(hsw_uncore_pci_ids) = {
- { /* IMC */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
- .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
- },
- { /* end: all zeroes */ },
-};
-
-static struct pci_driver snb_uncore_pci_driver = {
- .name = "snb_uncore",
- .id_table = snb_uncore_pci_ids,
-};
-
-static struct pci_driver ivb_uncore_pci_driver = {
- .name = "ivb_uncore",
- .id_table = ivb_uncore_pci_ids,
-};
-
-static struct pci_driver hsw_uncore_pci_driver = {
- .name = "hsw_uncore",
- .id_table = hsw_uncore_pci_ids,
-};
-
-/* end of Sandy Bridge uncore support */
-
-/* Nehalem uncore support */
-static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
-}
-
-static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
-}
-
-static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx < UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
- else
- wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
-}
-
-static struct attribute *nhm_uncore_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_cmask8.attr,
- NULL,
-};
-
-static struct attribute_group nhm_uncore_format_group = {
- .name = "format",
- .attrs = nhm_uncore_formats_attr,
-};
-
-static struct uncore_event_desc nhm_uncore_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
- INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
- INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhm_uncore_msr_ops = {
- .disable_box = nhm_uncore_msr_disable_box,
- .enable_box = nhm_uncore_msr_enable_box,
- .disable_event = snb_uncore_msr_disable_event,
- .enable_event = nhm_uncore_msr_enable_event,
- .read_counter = uncore_msr_read_counter,
-};
-
-static struct intel_uncore_type nhm_uncore = {
- .name = "",
- .num_counters = 8,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .fixed_ctr_bits = 48,
- .event_ctl = NHM_UNC_PERFEVTSEL0,
- .perf_ctr = NHM_UNC_UNCORE_PMC0,
- .fixed_ctr = NHM_UNC_FIXED_CTR,
- .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
- .event_mask = NHM_UNC_RAW_EVENT_MASK,
- .event_descs = nhm_uncore_events,
- .ops = &nhm_uncore_msr_ops,
- .format_group = &nhm_uncore_format_group,
-};
-
-static struct intel_uncore_type *nhm_msr_uncores[] = {
- &nhm_uncore,
- NULL,
-};
-/* end of Nehalem uncore support */
-
-/* Nehalem-EX uncore support */
-DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
-DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
-DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
-
-static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
-{
- wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
-}
-
-static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config &= ~((1ULL << uncore_num_counters(box)) - 1);
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
-{
- unsigned msr = uncore_msr_box_ctl(box);
- u64 config;
-
- if (msr) {
- rdmsrl(msr, config);
- config |= (1ULL << uncore_num_counters(box)) - 1;
- /* WBox has a fixed counter */
- if (uncore_msr_fixed_ctl(box))
- config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
- wrmsrl(msr, config);
- }
-}
-
-static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- wrmsrl(event->hw.config_base, 0);
-}
-
-static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
-
- if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
- else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
- else
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-#define NHMEX_UNCORE_OPS_COMMON_INIT() \
- .init_box = nhmex_uncore_msr_init_box, \
- .disable_box = nhmex_uncore_msr_disable_box, \
- .enable_box = nhmex_uncore_msr_enable_box, \
- .disable_event = nhmex_uncore_msr_disable_event, \
- .read_counter = uncore_msr_read_counter
-
-static struct intel_uncore_ops nhmex_uncore_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_uncore_msr_enable_event,
-};
-
-static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_edge.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_ubox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_ubox_formats_attr,
-};
-
-static struct intel_uncore_type nhmex_uncore_ubox = {
- .name = "ubox",
- .num_counters = 1,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
- .perf_ctr = NHMEX_U_MSR_PMON_CTR,
- .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_ubox_format_group
-};
-
-static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_cbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_cbox_formats_attr,
-};
-
-/* msr offset for each instance of cbox */
-static unsigned nhmex_cbox_msr_offsets[] = {
- 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
-};
-
-static struct intel_uncore_type nhmex_uncore_cbox = {
- .name = "cbox",
- .num_counters = 6,
- .num_boxes = 10,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
- .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
- .msr_offsets = nhmex_cbox_msr_offsets,
- .pair_ctr_ctl = 1,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_type nhmex_uncore_wbox = {
- .name = "wbox",
- .num_counters = 4,
- .num_boxes = 1,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_W_MSR_PMON_CNT0,
- .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
- .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
- .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
- .pair_ctr_ctl = 1,
- .event_descs = nhmex_uncore_wbox_events,
- .ops = &nhmex_uncore_ops,
- .format_group = &nhmex_uncore_cbox_format_group
-};
-
-static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int ctr, ev_sel;
-
- ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
- NHMEX_B_PMON_CTR_SHIFT;
- ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
-
- /* events that do not use the match/mask registers */
- if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
- (ctr == 2 && ev_sel != 0x4) || ctr == 3)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_B0_MSR_MATCH;
- else
- reg1->reg = NHMEX_B1_MSR_MATCH;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, reg1->config);
- wrmsrl(reg1->reg + 1, reg2->config);
- }
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
-}
-
-/*
- * The Bbox has 4 counters, but each counter monitors different events.
- * Use bits 6-7 in the event config to select counter.
- */
-static struct event_constraint nhmex_uncore_bbox_constraints[] = {
- EVENT_CONSTRAINT(0 , 1, 0xc0),
- EVENT_CONSTRAINT(0x40, 2, 0xc0),
- EVENT_CONSTRAINT(0x80, 4, 0xc0),
- EVENT_CONSTRAINT(0xc0, 8, 0xc0),
- EVENT_CONSTRAINT_END,
-};
-
-static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_counter.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_bbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_bbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_bbox_msr_enable_event,
- .hw_config = nhmex_bbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_bbox = {
- .name = "bbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
- .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_B_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .constraints = nhmex_uncore_bbox_constraints,
- .ops = &nhmex_uncore_bbox_ops,
- .format_group = &nhmex_uncore_bbox_format_group
-};
-
-static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- /* only TO_R_PROG_EV event uses the match/mask register */
- if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
- NHMEX_S_EVENT_TO_R_PROG_EV)
- return 0;
-
- if (box->pmu->pmu_idx == 0)
- reg1->reg = NHMEX_S0_MSR_MM_CFG;
- else
- reg1->reg = NHMEX_S1_MSR_MM_CFG;
- reg1->idx = 0;
- reg1->config = event->attr.config1;
- reg2->config = event->attr.config2;
- return 0;
-}
-
-static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
-
- if (reg1->idx != EXTRA_REG_NONE) {
- wrmsrl(reg1->reg, 0);
- wrmsrl(reg1->reg + 1, reg1->config);
- wrmsrl(reg1->reg + 2, reg2->config);
- wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
- }
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
-}
-
-static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
- &format_attr_event.attr,
- &format_attr_umask.attr,
- &format_attr_edge.attr,
- &format_attr_inv.attr,
- &format_attr_thresh8.attr,
- &format_attr_match.attr,
- &format_attr_mask.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_sbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_sbox_formats_attr,
-};
-
-static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_sbox_msr_enable_event,
- .hw_config = nhmex_sbox_hw_config,
- .get_constraint = uncore_get_constraint,
- .put_constraint = uncore_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_sbox = {
- .name = "sbox",
- .num_counters = 4,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
- .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
- .msr_offset = NHMEX_S_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 1,
- .ops = &nhmex_uncore_sbox_ops,
- .format_group = &nhmex_uncore_sbox_format_group
-};
-
-enum {
- EXTRA_REG_NHMEX_M_FILTER,
- EXTRA_REG_NHMEX_M_DSP,
- EXTRA_REG_NHMEX_M_ISS,
- EXTRA_REG_NHMEX_M_MAP,
- EXTRA_REG_NHMEX_M_MSC_THR,
- EXTRA_REG_NHMEX_M_PGT,
- EXTRA_REG_NHMEX_M_PLD,
- EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
-};
-
-static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
- MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
- MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
- MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
- /* event 0xa uses two extra registers */
- MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
- MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
- MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
- /* events 0xd ~ 0x10 use the same extra register */
- MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
- MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
- MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
- EVENT_EXTRA_END
-};
-
-/* Nehalem-EX or Westmere-EX ? */
-static bool uncore_nhmex;
-
-static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- bool ret = false;
- u64 mask;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (!atomic_read(&er->ref) || er->config == config) {
- atomic_inc(&er->ref);
- er->config = config;
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
- }
- /*
- * The ZDP_CTL_FVC MSR has 4 fields which are used to control
- * events 0xd ~ 0x10. Besides these 4 fields, there are additional
- * fields which are shared.
- */
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (WARN_ON_ONCE(idx >= 4))
- return false;
-
- /* mask of the shared fields */
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
-
- raw_spin_lock_irqsave(&er->lock, flags);
- /* add mask of the non-shared field if it's in use */
- if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
- if (uncore_nhmex)
- mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- }
-
- if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
- atomic_add(1 << (idx * 8), &er->ref);
- if (uncore_nhmex)
- mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
- NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
- WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- er->config &= ~mask;
- er->config |= (config & mask);
- ret = true;
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- return ret;
-}
-
-static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- er = &box->shared_regs[idx];
- atomic_dec(&er->ref);
- return;
- }
-
- idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- atomic_sub(1 << (idx * 8), &er->ref);
-}
-
-static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
- u64 config = reg1->config;
-
- /* get the non-shared control bits and shift them */
- idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (uncore_nhmex)
- config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- else
- config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
- if (new_idx > orig_idx) {
- idx = new_idx - orig_idx;
- config <<= 3 * idx;
- } else {
- idx = orig_idx - new_idx;
- config >>= 3 * idx;
- }
-
- /* add the shared control bits back */
- if (uncore_nhmex)
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- else
- config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
- if (modify) {
- /* adjust the main event selector */
- if (new_idx > orig_idx)
- hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- else
- hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
- reg1->config = config;
- reg1->idx = ~0xff | new_idx;
- }
- return config;
-}
-
-static struct event_constraint *
-nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int i, idx[2], alloc = 0;
- u64 config1 = reg1->config;
-
- idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
- idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
-again:
- for (i = 0; i < 2; i++) {
- if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
- idx[i] = 0xff;
-
- if (idx[i] == 0xff)
- continue;
-
- if (!nhmex_mbox_get_shared_reg(box, idx[i],
- __BITS_VALUE(config1, i, 32)))
- goto fail;
- alloc |= (0x1 << i);
- }
-
- /* for the match/mask registers */
- if (reg2->idx != EXTRA_REG_NONE &&
- (uncore_box_is_fake(box) || !reg2->alloc) &&
- !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
- goto fail;
-
- /*
- * If it's a fake box -- as per validate_{group,event}() we
- * shouldn't touch event state and we can avoid doing so
- * since both will only call get_event_constraints() once
- * on each event, this avoids the need for reg->alloc.
- */
- if (!uncore_box_is_fake(box)) {
- if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
- nhmex_mbox_alter_er(event, idx[0], true);
- reg1->alloc |= alloc;
- if (reg2->idx != EXTRA_REG_NONE)
- reg2->alloc = 1;
- }
- return NULL;
-fail:
- if (idx[0] != 0xff && !(alloc & 0x1) &&
- idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
- /*
- * events 0xd ~ 0x10 are functional identical, but are
- * controlled by different fields in the ZDP_CTL_FVC
- * register. If we failed to take one field, try the
- * rest 3 choices.
- */
- BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
- idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- idx[0] = (idx[0] + 1) % 4;
- idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
- if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
- config1 = nhmex_mbox_alter_er(event, idx[0], false);
- goto again;
- }
- }
-
- if (alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, idx[0]);
- if (alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, idx[1]);
- return &constraint_empty;
-}
-
-static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
-
- if (uncore_box_is_fake(box))
- return;
-
- if (reg1->alloc & 0x1)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
- if (reg1->alloc & 0x2)
- nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
- reg1->alloc = 0;
-
- if (reg2->alloc) {
- nhmex_mbox_put_shared_reg(box, reg2->idx);
- reg2->alloc = 0;
- }
-}
-
-static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
-{
- if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return er->idx;
- return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
-}
-
-static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_type *type = box->pmu->type;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- struct extra_reg *er;
- unsigned msr;
- int reg_idx = 0;
- /*
- * The mbox events may require 2 extra MSRs at the most. But only
- * the lower 32 bits in these MSRs are significant, so we can use
- * config1 to pass two MSRs' config.
- */
- for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
- if (er->event != (event->hw.config & er->config_mask))
- continue;
- if (event->attr.config1 & ~er->valid_mask)
- return -EINVAL;
-
- msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
- if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
- return -EINVAL;
-
- /* always use the 32~63 bits to pass the PLD config */
- if (er->idx == EXTRA_REG_NHMEX_M_PLD)
- reg_idx = 1;
- else if (WARN_ON_ONCE(reg_idx > 0))
- return -EINVAL;
-
- reg1->idx &= ~(0xff << (reg_idx * 8));
- reg1->reg &= ~(0xffff << (reg_idx * 16));
- reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
- reg1->reg |= msr << (reg_idx * 16);
- reg1->config = event->attr.config1;
- reg_idx++;
- }
- /*
- * The mbox only provides ability to perform address matching
- * for the PLD events.
- */
- if (reg_idx == 2) {
- reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
- if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
- reg2->config = event->attr.config2;
- else
- reg2->config = ~0ULL;
- if (box->pmu->pmu_idx == 0)
- reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
- else
- reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
- }
- return 0;
-}
-
-static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
-{
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- u64 config;
-
- if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
- return box->shared_regs[idx].config;
-
- er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
- raw_spin_lock_irqsave(&er->lock, flags);
- config = er->config;
- raw_spin_unlock_irqrestore(&er->lock, flags);
- return config;
-}
-
-static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx;
-
- idx = __BITS_VALUE(reg1->idx, 0, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
- nhmex_mbox_shared_reg_config(box, idx));
- idx = __BITS_VALUE(reg1->idx, 1, 8);
- if (idx != 0xff)
- wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
- nhmex_mbox_shared_reg_config(box, idx));
-
- if (reg2->idx != EXTRA_REG_NONE) {
- wrmsrl(reg2->reg, 0);
- if (reg2->config != ~0ULL) {
- wrmsrl(reg2->reg + 1,
- reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
- wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
- (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
- wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
- }
- }
-
- wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
-DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
-DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
-DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
-DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
-DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
-DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
-DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
-DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
-DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
-DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
-
-static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
- &format_attr_count_mode.attr,
- &format_attr_storage_mode.attr,
- &format_attr_wrap_mode.attr,
- &format_attr_flag_mode.attr,
- &format_attr_inc_sel.attr,
- &format_attr_set_flag_sel.attr,
- &format_attr_filter_cfg_en.attr,
- &format_attr_filter_match.attr,
- &format_attr_filter_mask.attr,
- &format_attr_dsp.attr,
- &format_attr_thr.attr,
- &format_attr_fvc.attr,
- &format_attr_pgt.attr,
- &format_attr_map.attr,
- &format_attr_iss.attr,
- &format_attr_pld.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_mbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_mbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
- { /* end: all zeroes */ },
-};
-
-static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
- INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_mbox_msr_enable_event,
- .hw_config = nhmex_mbox_hw_config,
- .get_constraint = nhmex_mbox_get_constraint,
- .put_constraint = nhmex_mbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_mbox = {
- .name = "mbox",
- .num_counters = 6,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
- .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
- .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_M_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 8,
- .event_descs = nhmex_uncore_mbox_events,
- .ops = &nhmex_uncore_mbox_ops,
- .format_group = &nhmex_uncore_mbox_format_group,
-};
-
-static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
-
- /* adjust the main event selector and extra register index */
- if (reg1->idx % 2) {
- reg1->idx--;
- hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- } else {
- reg1->idx++;
- hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- }
-
- /* adjust extra register config */
- switch (reg1->idx % 6) {
- case 2:
- /* shift the 8~15 bits to the 0~7 bits */
- reg1->config >>= 8;
- break;
- case 3:
- /* shift the 0~7 bits to the 8~15 bits */
- reg1->config <<= 8;
- break;
- };
-}
-
-/*
- * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
- * An event set consists of 6 events, the 3rd and 4th events in
- * an event set use the same extra register. So an event set uses
- * 5 extra registers.
- */
-static struct event_constraint *
-nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- struct intel_uncore_extra_reg *er;
- unsigned long flags;
- int idx, er_idx;
- u64 config1;
- bool ok = false;
-
- if (!uncore_box_is_fake(box) && reg1->alloc)
- return NULL;
-
- idx = reg1->idx % 6;
- config1 = reg1->config;
-again:
- er_idx = idx;
- /* the 3rd and 4th events use the same extra register */
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- raw_spin_lock_irqsave(&er->lock, flags);
- if (idx < 2) {
- if (!atomic_read(&er->ref) || er->config == reg1->config) {
- atomic_inc(&er->ref);
- er->config = reg1->config;
- ok = true;
- }
- } else if (idx == 2 || idx == 3) {
- /*
- * these two events use different fields in a extra register,
- * the 0~7 bits and the 8~15 bits respectively.
- */
- u64 mask = 0xff << ((idx - 2) * 8);
- if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
- !((er->config ^ config1) & mask)) {
- atomic_add(1 << ((idx - 2) * 8), &er->ref);
- er->config &= ~mask;
- er->config |= config1 & mask;
- ok = true;
- }
- } else {
- if (!atomic_read(&er->ref) ||
- (er->config == (hwc->config >> 32) &&
- er->config1 == reg1->config &&
- er->config2 == reg2->config)) {
- atomic_inc(&er->ref);
- er->config = (hwc->config >> 32);
- er->config1 = reg1->config;
- er->config2 = reg2->config;
- ok = true;
- }
- }
- raw_spin_unlock_irqrestore(&er->lock, flags);
-
- if (!ok) {
- /*
- * The Rbox events are always in pairs. The paired
- * events are functional identical, but use different
- * extra registers. If we failed to take an extra
- * register, try the alternative.
- */
- idx ^= 1;
- if (idx != reg1->idx % 6) {
- if (idx == 2)
- config1 >>= 8;
- else if (idx == 3)
- config1 <<= 8;
- goto again;
- }
- } else {
- if (!uncore_box_is_fake(box)) {
- if (idx != reg1->idx % 6)
- nhmex_rbox_alter_er(box, event);
- reg1->alloc = 1;
- }
- return NULL;
- }
- return &constraint_empty;
-}
-
-static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct intel_uncore_extra_reg *er;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- int idx, er_idx;
-
- if (uncore_box_is_fake(box) || !reg1->alloc)
- return;
-
- idx = reg1->idx % 6;
- er_idx = idx;
- if (er_idx > 2)
- er_idx--;
- er_idx += (reg1->idx / 6) * 5;
-
- er = &box->shared_regs[er_idx];
- if (idx == 2 || idx == 3)
- atomic_sub(1 << ((idx - 2) * 8), &er->ref);
- else
- atomic_dec(&er->ref);
-
- reg1->alloc = 0;
-}
-
-static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
- struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
- int idx;
-
- idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
- NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
- if (idx >= 0x18)
- return -EINVAL;
-
- reg1->idx = idx;
- reg1->config = event->attr.config1;
-
- switch (idx % 6) {
- case 4:
- case 5:
- hwc->config |= event->attr.config & (~0ULL << 32);
- reg2->config = event->attr.config2;
- break;
- };
- return 0;
-}
-
-static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
-{
- struct hw_perf_event *hwc = &event->hw;
- struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
- struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
- int idx, port;
-
- idx = reg1->idx;
- port = idx / 6 + box->pmu->pmu_idx * 4;
-
- switch (idx % 6) {
- case 0:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
- break;
- case 1:
- wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
- break;
- case 2:
- case 3:
- wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
- uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
- break;
- case 4:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
- break;
- case 5:
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
- hwc->config >> 32);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
- wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
- break;
- };
-
- wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
- (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
-}
-
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
-DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
-DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
-
-static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
- &format_attr_event5.attr,
- &format_attr_xbr_mm_cfg.attr,
- &format_attr_xbr_match.attr,
- &format_attr_xbr_mask.attr,
- &format_attr_qlx_cfg.attr,
- &format_attr_iperf_cfg.attr,
- NULL,
-};
-
-static struct attribute_group nhmex_uncore_rbox_format_group = {
- .name = "format",
- .attrs = nhmex_uncore_rbox_formats_attr,
-};
-
-static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
- INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
- INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
- INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
- { /* end: all zeroes */ },
-};
-
-static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
- NHMEX_UNCORE_OPS_COMMON_INIT(),
- .enable_event = nhmex_rbox_msr_enable_event,
- .hw_config = nhmex_rbox_hw_config,
- .get_constraint = nhmex_rbox_get_constraint,
- .put_constraint = nhmex_rbox_put_constraint,
-};
-
-static struct intel_uncore_type nhmex_uncore_rbox = {
- .name = "rbox",
- .num_counters = 8,
- .num_boxes = 2,
- .perf_ctr_bits = 48,
- .event_ctl = NHMEX_R_MSR_PMON_CTL0,
- .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
- .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
- .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
- .msr_offset = NHMEX_R_MSR_OFFSET,
- .pair_ctr_ctl = 1,
- .num_shared_regs = 20,
- .event_descs = nhmex_uncore_rbox_events,
- .ops = &nhmex_uncore_rbox_ops,
- .format_group = &nhmex_uncore_rbox_format_group
-};
-
-static struct intel_uncore_type *nhmex_msr_uncores[] = {
- &nhmex_uncore_ubox,
- &nhmex_uncore_cbox,
- &nhmex_uncore_bbox,
- &nhmex_uncore_sbox,
- &nhmex_uncore_mbox,
- &nhmex_uncore_rbox,
- &nhmex_uncore_wbox,
- NULL,
-};
-/* end of Nehalem-EX uncore support */
-
static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
{
struct hw_perf_event *hwc = &event->hw;
@@ -3140,7 +170,7 @@ static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_eve
hwc->event_base = uncore_perf_ctr(box, hwc->idx);
}
-static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
{
u64 prev_count, new_count, delta;
int shift;
@@ -3201,14 +231,14 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
-static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
__hrtimer_start_range_ns(&box->hrtimer,
ns_to_ktime(box->hrtimer_duration), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
-static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
{
hrtimer_cancel(&box->hrtimer);
}
@@ -3291,7 +321,7 @@ uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *eve
}
if (event->attr.config == UNCORE_FIXED_EVENT)
- return &constraint_fixed;
+ return &uncore_constraint_fixed;
if (type->constraints) {
for_each_event_constraint(c, type->constraints) {
@@ -3496,7 +526,7 @@ static void uncore_pmu_event_del(struct perf_event *event, int flags)
event->hw.last_tag = ~0ULL;
}
-static void uncore_pmu_event_read(struct perf_event *event)
+void uncore_pmu_event_read(struct perf_event *event)
{
struct intel_uncore_box *box = uncore_event_to_box(event);
uncore_perf_event_update(box, event);
@@ -3635,7 +665,7 @@ static struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
-static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
+static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -3758,9 +788,6 @@ fail:
return ret;
}
-static struct pci_driver *uncore_pci_driver;
-static bool pcidrv_registered;
-
/*
* add a pci uncore device
*/
@@ -3770,18 +797,20 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
struct intel_uncore_box *box;
struct intel_uncore_type *type;
int phys_id;
+ bool first_box = false;
- phys_id = pcibus_to_physid[pdev->bus->number];
+ phys_id = uncore_pcibus_to_physid[pdev->bus->number];
if (phys_id < 0)
return -ENODEV;
if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
- extra_pci_dev[phys_id][UNCORE_PCI_DEV_IDX(id->driver_data)] = pdev;
+ int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
+ uncore_extra_pci_dev[phys_id][idx] = pdev;
pci_set_drvdata(pdev, NULL);
return 0;
}
- type = pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
+ type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
box = uncore_alloc_box(type, NUMA_NO_NODE);
if (!box)
return -ENOMEM;
@@ -3803,9 +832,13 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
pci_set_drvdata(pdev, box);
raw_spin_lock(&uncore_box_lock);
+ if (list_empty(&pmu->box_list))
+ first_box = true;
list_add_tail(&box->list, &pmu->box_list);
raw_spin_unlock(&uncore_box_lock);
+ if (first_box)
+ uncore_pmu_register(pmu);
return 0;
}
@@ -3813,13 +846,14 @@ static void uncore_pci_remove(struct pci_dev *pdev)
{
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
- int i, cpu, phys_id = pcibus_to_physid[pdev->bus->number];
+ int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+ bool last_box = false;
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
- if (extra_pci_dev[phys_id][i] == pdev) {
- extra_pci_dev[phys_id][i] = NULL;
+ if (uncore_extra_pci_dev[phys_id][i] == pdev) {
+ uncore_extra_pci_dev[phys_id][i] = NULL;
break;
}
}
@@ -3835,6 +869,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
raw_spin_lock(&uncore_box_lock);
list_del(&box->list);
+ if (list_empty(&pmu->box_list))
+ last_box = true;
raw_spin_unlock(&uncore_box_lock);
for_each_possible_cpu(cpu) {
@@ -3846,6 +882,9 @@ static void uncore_pci_remove(struct pci_dev *pdev)
WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
kfree(box);
+
+ if (last_box)
+ perf_pmu_unregister(&pmu->pmu);
}
static int __init uncore_pci_init(void)
@@ -3854,46 +893,32 @@ static int __init uncore_pci_init(void)
switch (boot_cpu_data.x86_model) {
case 45: /* Sandy Bridge-EP */
- ret = snbep_pci2phy_map_init(0x3ce0);
- if (ret)
- return ret;
- pci_uncores = snbep_pci_uncores;
- uncore_pci_driver = &snbep_uncore_pci_driver;
+ ret = snbep_uncore_pci_init();
break;
- case 62: /* IvyTown */
- ret = snbep_pci2phy_map_init(0x0e1e);
- if (ret)
- return ret;
- pci_uncores = ivt_pci_uncores;
- uncore_pci_driver = &ivt_uncore_pci_driver;
+ case 62: /* Ivy Bridge-EP */
+ ret = ivbep_uncore_pci_init();
+ break;
+ case 63: /* Haswell-EP */
+ ret = hswep_uncore_pci_init();
break;
case 42: /* Sandy Bridge */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_SNB_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &snb_uncore_pci_driver;
+ ret = snb_uncore_pci_init();
break;
case 58: /* Ivy Bridge */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_IVB_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &ivb_uncore_pci_driver;
+ ret = ivb_uncore_pci_init();
break;
case 60: /* Haswell */
case 69: /* Haswell Celeron */
- ret = snb_pci2phy_map_init(PCI_DEVICE_ID_INTEL_HSW_IMC);
- if (ret)
- return ret;
- pci_uncores = snb_pci_uncores;
- uncore_pci_driver = &hsw_uncore_pci_driver;
+ ret = hsw_uncore_pci_init();
break;
default:
return 0;
}
- ret = uncore_types_init(pci_uncores);
+ if (ret)
+ return ret;
+
+ ret = uncore_types_init(uncore_pci_uncores);
if (ret)
return ret;
@@ -3904,7 +929,7 @@ static int __init uncore_pci_init(void)
if (ret == 0)
pcidrv_registered = true;
else
- uncore_types_exit(pci_uncores);
+ uncore_types_exit(uncore_pci_uncores);
return ret;
}
@@ -3914,7 +939,7 @@ static void __init uncore_pci_exit(void)
if (pcidrv_registered) {
pcidrv_registered = false;
pci_unregister_driver(uncore_pci_driver);
- uncore_types_exit(pci_uncores);
+ uncore_types_exit(uncore_pci_uncores);
}
}
@@ -3940,8 +965,8 @@ static void uncore_cpu_dying(int cpu)
struct intel_uncore_box *box;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
@@ -3961,8 +986,8 @@ static int uncore_cpu_starting(int cpu)
phys_id = topology_physical_package_id(cpu);
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
@@ -4002,8 +1027,8 @@ static int uncore_cpu_prepare(int cpu, int phys_id)
struct intel_uncore_box *box;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
if (pmu->func_id < 0)
@@ -4083,8 +1108,8 @@ static void uncore_event_exit_cpu(int cpu)
if (target >= 0)
cpumask_set_cpu(target, &uncore_cpu_mask);
- uncore_change_context(msr_uncores, cpu, target);
- uncore_change_context(pci_uncores, cpu, target);
+ uncore_change_context(uncore_msr_uncores, cpu, target);
+ uncore_change_context(uncore_pci_uncores, cpu, target);
}
static void uncore_event_init_cpu(int cpu)
@@ -4099,8 +1124,8 @@ static void uncore_event_init_cpu(int cpu)
cpumask_set_cpu(cpu, &uncore_cpu_mask);
- uncore_change_context(msr_uncores, -1, cpu);
- uncore_change_context(pci_uncores, -1, cpu);
+ uncore_change_context(uncore_msr_uncores, -1, cpu);
+ uncore_change_context(uncore_pci_uncores, -1, cpu);
}
static int uncore_cpu_notifier(struct notifier_block *self,
@@ -4160,47 +1185,37 @@ static void __init uncore_cpu_setup(void *dummy)
static int __init uncore_cpu_init(void)
{
- int ret, max_cores;
+ int ret;
- max_cores = boot_cpu_data.x86_max_cores;
switch (boot_cpu_data.x86_model) {
case 26: /* Nehalem */
case 30:
case 37: /* Westmere */
case 44:
- msr_uncores = nhm_msr_uncores;
+ nhm_uncore_cpu_init();
break;
case 42: /* Sandy Bridge */
case 58: /* Ivy Bridge */
- if (snb_uncore_cbox.num_boxes > max_cores)
- snb_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snb_msr_uncores;
+ snb_uncore_cpu_init();
break;
case 45: /* Sandy Bridge-EP */
- if (snbep_uncore_cbox.num_boxes > max_cores)
- snbep_uncore_cbox.num_boxes = max_cores;
- msr_uncores = snbep_msr_uncores;
+ snbep_uncore_cpu_init();
break;
case 46: /* Nehalem-EX */
- uncore_nhmex = true;
case 47: /* Westmere-EX aka. Xeon E7 */
- if (!uncore_nhmex)
- nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
- if (nhmex_uncore_cbox.num_boxes > max_cores)
- nhmex_uncore_cbox.num_boxes = max_cores;
- msr_uncores = nhmex_msr_uncores;
+ nhmex_uncore_cpu_init();
break;
- case 62: /* IvyTown */
- if (ivt_uncore_cbox.num_boxes > max_cores)
- ivt_uncore_cbox.num_boxes = max_cores;
- msr_uncores = ivt_msr_uncores;
+ case 62: /* Ivy Bridge-EP */
+ ivbep_uncore_cpu_init();
+ break;
+ case 63: /* Haswell-EP */
+ hswep_uncore_cpu_init();
break;
-
default:
return 0;
}
- ret = uncore_types_init(msr_uncores);
+ ret = uncore_types_init(uncore_msr_uncores);
if (ret)
return ret;
@@ -4213,16 +1228,8 @@ static int __init uncore_pmus_register(void)
struct intel_uncore_type *type;
int i, j;
- for (i = 0; msr_uncores[i]; i++) {
- type = msr_uncores[i];
- for (j = 0; j < type->num_boxes; j++) {
- pmu = &type->pmus[j];
- uncore_pmu_register(pmu);
- }
- }
-
- for (i = 0; pci_uncores[i]; i++) {
- type = pci_uncores[i];
+ for (i = 0; uncore_msr_uncores[i]; i++) {
+ type = uncore_msr_uncores[i];
for (j = 0; j < type->num_boxes; j++) {
pmu = &type->pmus[j];
uncore_pmu_register(pmu);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 90236f0c94a9..18eb78bbdd10 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -24,395 +24,6 @@
#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
-/* SNB event control */
-#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
-#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
-#define SNB_UNC_CTL_EDGE_DET (1 << 18)
-#define SNB_UNC_CTL_EN (1 << 22)
-#define SNB_UNC_CTL_INVERT (1 << 23)
-#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
-#define NHM_UNC_CTL_CMASK_MASK 0xff000000
-#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
-
-#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- SNB_UNC_CTL_CMASK_MASK)
-
-#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
- SNB_UNC_CTL_UMASK_MASK | \
- SNB_UNC_CTL_EDGE_DET | \
- SNB_UNC_CTL_INVERT | \
- NHM_UNC_CTL_CMASK_MASK)
-
-/* SNB global control register */
-#define SNB_UNC_PERF_GLOBAL_CTL 0x391
-#define SNB_UNC_FIXED_CTR_CTRL 0x394
-#define SNB_UNC_FIXED_CTR 0x395
-
-/* SNB uncore global control */
-#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
-#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
-
-/* SNB Cbo register */
-#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
-#define SNB_UNC_CBO_0_PER_CTR0 0x706
-#define SNB_UNC_CBO_MSR_OFFSET 0x10
-
-/* NHM global control register */
-#define NHM_UNC_PERF_GLOBAL_CTL 0x391
-#define NHM_UNC_FIXED_CTR 0x394
-#define NHM_UNC_FIXED_CTR_CTRL 0x395
-
-/* NHM uncore global control */
-#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
-#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
-
-/* NHM uncore register */
-#define NHM_UNC_PERFEVTSEL0 0x3c0
-#define NHM_UNC_UNCORE_PMC0 0x3b0
-
-/* SNB-EP Box level control */
-#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
-#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
-#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
-#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
-#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS | \
- SNBEP_PMON_BOX_CTL_FRZ_EN)
-/* SNB-EP event control */
-#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
-#define SNBEP_PMON_CTL_RST (1 << 17)
-#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
-#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
-#define SNBEP_PMON_CTL_EN (1 << 22)
-#define SNBEP_PMON_CTL_INVERT (1 << 23)
-#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
-#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PMON_CTL_TRESH_MASK)
-
-/* SNB-EP Ubox event control */
-#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-
-#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
-#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-/* SNB-EP PCU event control */
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
-#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
-#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
-#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
- SNBEP_PMON_CTL_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-
-#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* SNB-EP pci control register */
-#define SNBEP_PCI_PMON_BOX_CTL 0xf4
-#define SNBEP_PCI_PMON_CTL0 0xd8
-/* SNB-EP pci counter register */
-#define SNBEP_PCI_PMON_CTR0 0xa0
-
-/* SNB-EP home agent register */
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
-#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
-#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
-/* SNB-EP memory controller register */
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
-#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
-/* SNB-EP QPI register */
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
-#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
-#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
-
-/* SNB-EP Ubox register */
-#define SNBEP_U_MSR_PMON_CTR0 0xc16
-#define SNBEP_U_MSR_PMON_CTL0 0xc10
-
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
-#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
-
-/* SNB-EP Cbo register */
-#define SNBEP_C0_MSR_PMON_CTR0 0xd16
-#define SNBEP_C0_MSR_PMON_CTL0 0xd10
-#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
-#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
-#define SNBEP_CBO_MSR_OFFSET 0x20
-
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
-#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
-
-#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
- .event = (e), \
- .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
- .config_mask = (m), \
- .idx = (i) \
-}
-
-/* SNB-EP PCU register */
-#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
-#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
-#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
-#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
-#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
-#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
-
-/* IVT event control */
-#define IVT_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
- SNBEP_PMON_BOX_CTL_RST_CTRS)
-#define IVT_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_TRESH_MASK)
-/* IVT Ubox */
-#define IVT_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define IVT_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-#define IVT_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
-
-#define IVT_U_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_UMASK_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
-/* IVT Cbo */
-#define IVT_CBO_MSR_PMON_RAW_EVENT_MASK (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_CBO_PMON_CTL_TID_EN)
-
-#define IVT_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
-#define IVT_CB0_MSR_PMON_BOX_FILTER_IOSC (0x1ULL << 63)
-
-/* IVT home agent */
-#define IVT_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
-#define IVT_HA_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- IVT_HA_PCI_PMON_CTL_Q_OCC_RST)
-/* IVT PCU */
-#define IVT_PCU_MSR_PMON_RAW_EVENT_MASK \
- (SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
- SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
- SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
-/* IVT QPI */
-#define IVT_QPI_PCI_PMON_RAW_EVENT_MASK \
- (IVT_PMON_RAW_EVENT_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT)
-
-/* NHM-EX event control */
-#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
-#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
-#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
-#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
-#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
-#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
-#define NHMEX_PMON_CTL_INVERT (1 << 23)
-#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
-#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_UMASK_MASK | \
- NHMEX_PMON_CTL_EDGE_DET | \
- NHMEX_PMON_CTL_INVERT | \
- NHMEX_PMON_CTL_TRESH_MASK)
-
-/* NHM-EX Ubox */
-#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
-#define NHMEX_U_MSR_PMON_CTR 0xc11
-#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
-
-#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
-#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
-#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
-#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
-#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
-
-#define NHMEX_U_PMON_RAW_EVENT_MASK \
- (NHMEX_PMON_CTL_EV_SEL_MASK | \
- NHMEX_PMON_CTL_EDGE_DET)
-
-/* NHM-EX Cbox */
-#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
-#define NHMEX_C0_MSR_PMON_CTR0 0xd11
-#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
-#define NHMEX_C_MSR_OFFSET 0x20
-
-/* NHM-EX Bbox */
-#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
-#define NHMEX_B0_MSR_PMON_CTR0 0xc31
-#define NHMEX_B0_MSR_PMON_CTL0 0xc30
-#define NHMEX_B_MSR_OFFSET 0x40
-#define NHMEX_B0_MSR_MATCH 0xe45
-#define NHMEX_B0_MSR_MASK 0xe46
-#define NHMEX_B1_MSR_MATCH 0xe4d
-#define NHMEX_B1_MSR_MASK 0xe4e
-
-#define NHMEX_B_PMON_CTL_EN (1 << 0)
-#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_B_PMON_CTR_SHIFT 6
-#define NHMEX_B_PMON_CTR_MASK \
- (0x3 << NHMEX_B_PMON_CTR_SHIFT)
-#define NHMEX_B_PMON_RAW_EVENT_MASK \
- (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
- NHMEX_B_PMON_CTR_MASK)
-
-/* NHM-EX Sbox */
-#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
-#define NHMEX_S0_MSR_PMON_CTR0 0xc51
-#define NHMEX_S0_MSR_PMON_CTL0 0xc50
-#define NHMEX_S_MSR_OFFSET 0x80
-#define NHMEX_S0_MSR_MM_CFG 0xe48
-#define NHMEX_S0_MSR_MATCH 0xe49
-#define NHMEX_S0_MSR_MASK 0xe4a
-#define NHMEX_S1_MSR_MM_CFG 0xe58
-#define NHMEX_S1_MSR_MATCH 0xe59
-#define NHMEX_S1_MSR_MASK 0xe5a
-
-#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
-#define NHMEX_S_EVENT_TO_R_PROG_EV 0
-
-/* NHM-EX Mbox */
-#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
-#define NHMEX_M0_MSR_PMU_DSP 0xca5
-#define NHMEX_M0_MSR_PMU_ISS 0xca6
-#define NHMEX_M0_MSR_PMU_MAP 0xca7
-#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
-#define NHMEX_M0_MSR_PMU_PGT 0xca9
-#define NHMEX_M0_MSR_PMU_PLD 0xcaa
-#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
-#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
-#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
-#define NHMEX_M_MSR_OFFSET 0x40
-#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
-#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
-
-#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
-#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
-#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
-
-#define NHMEX_M_PMON_CTL_EN (1 << 0)
-#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
-#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
-#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
-#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
- (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
-#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
-#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
-#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
-#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
- (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
-#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
- (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
-#define NHMEX_M_PMON_RAW_EVENT_MASK \
- (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
- NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
- NHMEX_M_PMON_CTL_WRAP_MODE | \
- NHMEX_M_PMON_CTL_FLAG_MODE | \
- NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
-
-#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
-#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
-
-#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
-#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
-
-/*
- * use the 9~13 bits to select event If the 7th bit is not set,
- * otherwise use the 19~21 bits to select event.
- */
-#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
-#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
- NHMEX_M_PMON_CTL_FLAG_MODE)
-#define MBOX_INC_SEL_EXTAR_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
-#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
- EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
- MBOX_SET_FLAG_SEL_MASK, \
- (u64)-1, NHMEX_M_##r)
-
-/* NHM-EX Rbox */
-#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
-#define NHMEX_R_MSR_PMON_CTL0 0xe10
-#define NHMEX_R_MSR_PMON_CNT0 0xe11
-#define NHMEX_R_MSR_OFFSET 0x20
-
-#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
- ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
-#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
-#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
- (((n) < 4 ? 0 : 0x10) + (n) * 4)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
- (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
- (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
-#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
- (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
-
-#define NHMEX_R_PMON_CTL_EN (1 << 0)
-#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
-#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
- (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
-#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
-#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
-
-/* NHM-EX Wbox */
-#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
-#define NHMEX_W_MSR_PMON_CNT0 0xc90
-#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
-#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
-#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
-
-#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
-
struct intel_uncore_ops;
struct intel_uncore_pmu;
struct intel_uncore_box;
@@ -505,6 +116,9 @@ struct uncore_event_desc {
const char *config;
};
+ssize_t uncore_event_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+
#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
{ \
.attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
@@ -522,15 +136,6 @@ static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
static struct kobj_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
-
-static ssize_t uncore_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- struct uncore_event_desc *event =
- container_of(attr, struct uncore_event_desc, attr);
- return sprintf(buf, "%s", event->config);
-}
-
static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
{
return box->pmu->type->box_ctl;
@@ -694,3 +299,41 @@ static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
}
+
+struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event);
+struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu);
+struct intel_uncore_box *uncore_event_to_box(struct perf_event *event);
+u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_pmu_start_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box);
+void uncore_pmu_event_read(struct perf_event *event);
+void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event);
+struct event_constraint *
+uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
+void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
+u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+
+extern struct intel_uncore_type **uncore_msr_uncores;
+extern struct intel_uncore_type **uncore_pci_uncores;
+extern struct pci_driver *uncore_pci_driver;
+extern int uncore_pcibus_to_physid[256];
+extern struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
+extern struct event_constraint uncore_constraint_empty;
+
+/* perf_event_intel_uncore_snb.c */
+int snb_uncore_pci_init(void);
+int ivb_uncore_pci_init(void);
+int hsw_uncore_pci_init(void);
+void snb_uncore_cpu_init(void);
+void nhm_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_snbep.c */
+int snbep_uncore_pci_init(void);
+void snbep_uncore_cpu_init(void);
+int ivbep_uncore_pci_init(void);
+void ivbep_uncore_cpu_init(void);
+int hswep_uncore_pci_init(void);
+void hswep_uncore_cpu_init(void);
+
+/* perf_event_intel_uncore_nhmex.c */
+void nhmex_uncore_cpu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
new file mode 100644
index 000000000000..2749965afed0
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_nhmex.c
@@ -0,0 +1,1221 @@
+/* Nehalem-EX/Westmere-EX uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* NHM-EX event control */
+#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
+/*
+ * Two enable bits are defined (bit 0 and bit 22).
+ * nhmex_uncore_msr_enable_event() uses bit 22 when the box type's
+ * event_mask already covers bit 0, and bit 0 otherwise.
+ */
+#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
+#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
+#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
+#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
+#define NHMEX_PMON_CTL_INVERT (1 << 23)
+/* threshold field, bits 24-31 ("TRESH" is the spelling used throughout) */
+#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
+/* union of the event select, umask, edge, invert and threshold fields */
+#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_PMON_CTL_UMASK_MASK | \
+ NHMEX_PMON_CTL_EDGE_DET | \
+ NHMEX_PMON_CTL_INVERT | \
+ NHMEX_PMON_CTL_TRESH_MASK)
+
+/* NHM-EX Ubox */
+#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define NHMEX_U_MSR_PMON_CTR 0xc11
+#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
+
+/* bit fields of NHMEX_U_MSR_PMON_GLOBAL_CTL */
+#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
+#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
+#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
+#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
+/*
+ * Unsigned on purpose: a signed (1 << 31) shifts into the sign bit and
+ * would sign-extend to 0xffffffff80000000 when widened to the u64 that
+ * wrmsrl() takes.
+ */
+#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1U << 31)
+
+/* the Ubox accepts only the event select and edge-detect fields */
+#define NHMEX_U_PMON_RAW_EVENT_MASK \
+	(NHMEX_PMON_CTL_EV_SEL_MASK | \
+	 NHMEX_PMON_CTL_EDGE_DET)
+
+/* NHM-EX Cbox */
+#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
+#define NHMEX_C0_MSR_PMON_CTR0 0xd11
+#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
+/*
+ * Not referenced by nhmex_uncore_cbox, which uses the per-instance
+ * nhmex_cbox_msr_offsets[] table (all multiples of 0x20) instead.
+ */
+#define NHMEX_C_MSR_OFFSET 0x20
+
+/* NHM-EX Bbox */
+#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
+#define NHMEX_B0_MSR_PMON_CTR0 0xc31
+#define NHMEX_B0_MSR_PMON_CTL0 0xc30
+#define NHMEX_B_MSR_OFFSET 0x40
+/*
+ * Each MASK MSR directly follows its MATCH MSR;
+ * nhmex_bbox_msr_enable_event() writes MATCH and then MATCH + 1.
+ */
+#define NHMEX_B0_MSR_MATCH 0xe45
+#define NHMEX_B0_MSR_MASK 0xe46
+#define NHMEX_B1_MSR_MATCH 0xe4d
+#define NHMEX_B1_MSR_MASK 0xe4e
+
+#define NHMEX_B_PMON_CTL_EN (1 << 0)
+/* event select: config bits 1-5 */
+#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
+/* counter select: config bits 6-7 */
+#define NHMEX_B_PMON_CTR_SHIFT 6
+#define NHMEX_B_PMON_CTR_MASK \
+ (0x3 << NHMEX_B_PMON_CTR_SHIFT)
+#define NHMEX_B_PMON_RAW_EVENT_MASK \
+ (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
+ NHMEX_B_PMON_CTR_MASK)
+
+/* NHM-EX Sbox */
+#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
+#define NHMEX_S0_MSR_PMON_CTR0 0xc51
+#define NHMEX_S0_MSR_PMON_CTL0 0xc50
+#define NHMEX_S_MSR_OFFSET 0x80
+/*
+ * MM_CFG, MATCH and MASK are three consecutive MSRs per Sbox;
+ * nhmex_sbox_msr_enable_event() relies on this by writing MM_CFG + 1
+ * and MM_CFG + 2.
+ */
+#define NHMEX_S0_MSR_MM_CFG 0xe48
+#define NHMEX_S0_MSR_MATCH 0xe49
+#define NHMEX_S0_MSR_MASK 0xe4a
+#define NHMEX_S1_MSR_MM_CFG 0xe58
+#define NHMEX_S1_MSR_MATCH 0xe59
+#define NHMEX_S1_MSR_MASK 0xe5a
+
+#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
+/* event select value of the only Sbox event that uses match/mask */
+#define NHMEX_S_EVENT_TO_R_PROG_EV 0
+
+/* NHM-EX Mbox */
+#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
+#define NHMEX_M0_MSR_PMU_DSP 0xca5
+#define NHMEX_M0_MSR_PMU_ISS 0xca6
+#define NHMEX_M0_MSR_PMU_MAP 0xca7
+#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
+#define NHMEX_M0_MSR_PMU_PGT 0xca9
+#define NHMEX_M0_MSR_PMU_PLD 0xcaa
+#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
+#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
+#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
+#define NHMEX_M_MSR_OFFSET 0x40
+#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
+#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
+
+/*
+ * Address filter: the enable bit goes in MM_CFG; the match value and the
+ * mask value (taken from config2 >> 34) are written to MM_CFG + 1 and
+ * MM_CFG + 2 by nhmex_mbox_msr_enable_event().
+ */
+#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
+#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
+#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
+
+#define NHMEX_M_PMON_CTL_EN (1 << 0)
+#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
+#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
+#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
+#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
+ (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
+#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
+#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
+#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
+#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
+ (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
+#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
+ (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
+#define NHMEX_M_PMON_RAW_EVENT_MASK \
+ (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
+ NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
+ NHMEX_M_PMON_CTL_WRAP_MODE | \
+ NHMEX_M_PMON_CTL_FLAG_MODE | \
+ NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
+
+/*
+ * ZDP_CTL_FVC layout. The *_FVC_MASK bits are shared by all users of the
+ * register; *_FVC_EVENT_MASK(n) is the 3-bit field owned by the n-th
+ * event. The Westmere-EX (WSMEX) layout is the Nehalem-EX one moved up
+ * by one bit.
+ */
+#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
+#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (11 + 3 * (n)))
+
+#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
+#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7ULL << (12 + 3 * (n)))
+
+/*
+ * Bits 9~13 (INC_SEL) select the event when bit 7 (FLAG_MODE) is clear;
+ * otherwise bits 19~21 (SET_FLAG_SEL) select the event.
+ */
+#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
+#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
+ NHMEX_M_PMON_CTL_FLAG_MODE)
+/*
+ * Table-entry builders for nhmex_uncore_mbox_extra_regs[]: @c is the
+ * selector value and @r is pasted into NHMEX_M0_MSR_PMU_<r> and
+ * NHMEX_M_<r>. "EXTAR" is the spelling the table uses.
+ * NOTE(review): the last argument presumably becomes EXTRA_REG_NHMEX_M_<r>
+ * inside EVENT_EXTRA_REG() -- confirm against that macro.
+ */
+#define MBOX_INC_SEL_EXTAR_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
+#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
+ EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
+ MBOX_SET_FLAG_SEL_MASK, \
+ (u64)-1, NHMEX_M_##r)
+
+/* NHM-EX Rbox */
+#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
+#define NHMEX_R_MSR_PMON_CTL0 0xe10
+#define NHMEX_R_MSR_PMON_CNT0 0xe11
+#define NHMEX_R_MSR_OFFSET 0x20
+
+/*
+ * Per-port MSR address helpers; @n is the port number. The QLX_CFG and
+ * XBR helpers switch to a second address range for ports 4 and above.
+ */
+#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
+ ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
+#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
+#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
+ (((n) < 4 ? 0 : 0x10) + (n) * 4)
+/* each XBR set is MM_CFG followed by MATCH (+1) and MASK (+2) */
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
+ (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
+ (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
+#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
+ (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
+
+#define NHMEX_R_PMON_CTL_EN (1 << 0)
+/* event select: config bits 1-5 */
+#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
+#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
+ (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
+#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
+#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
+
+/* NHM-EX Wbox */
+#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
+/*
+ * NOTE(review): nhmex_uncore_wbox uses CNT0 (0xc90) as .event_ctl and
+ * EVT_SEL0 (0xc91) as .perf_ctr, so these two names read as swapped
+ * relative to their use -- confirm against the hardware documentation.
+ */
+#define NHMEX_W_MSR_PMON_CNT0 0xc90
+#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
+#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
+#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
+
+/* box-control enable bit for the fixed counter; see the box enable/disable hooks */
+#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
+
+/*
+ * Extract the @i-th @n-bit wide field of @x (field 0 is the least
+ * significant one); the result is cast back to the type of @x.
+ */
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+/*
+ * Format attributes shared by several NHM-EX box types; the string gives
+ * the config word and bit range of each field.
+ * NOTE(review): the arguments are presumably (variable suffix, name
+ * exposed to user space, bit-range string) -- confirm against
+ * DEFINE_UNCORE_FORMAT_ATTR(). "event5" reuses the name "event" with a
+ * 5-bit range starting at bit 1.
+ */
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
+DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
+
+/*
+ * Box init hook: writes the EN_ALL bit to the Ubox global control MSR.
+ * The same MSR and value are used whichever box is passed in.
+ */
+static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	const u64 global_ctl = NHMEX_U_PMON_GLOBAL_EN_ALL;
+
+	wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, global_ctl);
+}
+
+/*
+ * Clear the per-counter enable bits (one bit per counter, starting at
+ * bit 0) in the box control MSR. Boxes without a control MSR are left
+ * untouched.
+ */
+static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	unsigned box_ctl = uncore_msr_box_ctl(box);
+	u64 val, clear;
+
+	if (!box_ctl)
+		return;
+
+	rdmsrl(box_ctl, val);
+	clear = (1ULL << uncore_num_counters(box)) - 1;
+	/* WBox has a fixed counter */
+	if (uncore_msr_fixed_ctl(box))
+		clear |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+	val &= ~clear;
+	wrmsrl(box_ctl, val);
+}
+
+/*
+ * Set the per-counter enable bits (one bit per counter, starting at
+ * bit 0) in the box control MSR. Boxes without a control MSR are left
+ * untouched.
+ */
+static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	unsigned box_ctl = uncore_msr_box_ctl(box);
+	u64 val, set;
+
+	if (!box_ctl)
+		return;
+
+	rdmsrl(box_ctl, val);
+	set = (1ULL << uncore_num_counters(box)) - 1;
+	/* WBox has a fixed counter */
+	if (uncore_msr_fixed_ctl(box))
+		set |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
+	val |= set;
+	wrmsrl(box_ctl, val);
+}
+
+/* Stop an event by writing zero to its control MSR; @box is not used. */
+static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	wrmsrl(hwc->config_base, 0);
+}
+
+/*
+ * Start an event by writing its control MSR.
+ *
+ * Fixed counters only get the bit-0 enable. For the others the event
+ * config is written together with an enable bit: bit 22 when the box
+ * type's event_mask covers bit 0, bit 0 otherwise.
+ */
+static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 ctl;
+
+	if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
+		ctl = NHMEX_PMON_CTL_EN_BIT0;
+	else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
+		ctl = hwc->config | NHMEX_PMON_CTL_EN_BIT22;
+	else
+		ctl = hwc->config | NHMEX_PMON_CTL_EN_BIT0;
+
+	wrmsrl(hwc->config_base, ctl);
+}
+
+/*
+ * Designated initializers shared by every NHM-EX ops table. The
+ * expansion has no trailing comma, so users write a comma after the
+ * macro call before adding box-specific hooks.
+ */
+#define NHMEX_UNCORE_OPS_COMMON_INIT() \
+ .init_box = nhmex_uncore_msr_init_box, \
+ .disable_box = nhmex_uncore_msr_disable_box, \
+ .enable_box = nhmex_uncore_msr_enable_box, \
+ .disable_event = nhmex_uncore_msr_disable_event, \
+ .read_counter = uncore_msr_read_counter
+
+/* ops for boxes that need no extra registers (ubox, cbox, wbox) */
+static struct intel_uncore_ops nhmex_uncore_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_uncore_msr_enable_event,
+};
+
+/* Ubox events expose only the "event" and "edge" format fields. */
+static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_ubox_formats_attr,
+};
+
+/* Ubox: a single box with a single 48-bit counter. */
+static struct intel_uncore_type nhmex_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
+ .perf_ctr = NHMEX_U_MSR_PMON_CTR,
+ .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_ubox_format_group
+};
+
+/* Format fields for the Cbox; nhmex_uncore_wbox reuses this group. */
+static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_cbox_formats_attr,
+};
+
+/* msr offset for each instance of cbox */
+/* ten entries, matching .num_boxes of nhmex_uncore_cbox; not a uniform stride */
+static unsigned nhmex_cbox_msr_offsets[] = {
+ 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
+};
+
+/* Cbox: ten boxes with six 48-bit counters each, addressed via the table above. */
+static struct intel_uncore_type nhmex_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 6,
+ .num_boxes = 10,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
+ .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
+ .msr_offsets = nhmex_cbox_msr_offsets,
+ .pair_ctr_ctl = 1,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+/* Named Wbox events; "clockticks" is encoded as event 0xff with umask 0. */
+static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
+ { /* end: all zeroes */ },
+};
+
+/*
+ * Wbox: one box with four 48-bit counters plus a fixed counter
+ * (fixed_ctr/fixed_ctl). It reuses the Cbox format group.
+ * NOTE(review): .event_ctl takes the macro named ..._CNT0 (0xc90) and
+ * .perf_ctr the one named ..._EVT_SEL0 (0xc91). As in the other boxes
+ * the control MSR is the lower address of the pair, so the macro names
+ * read as swapped -- confirm against the hardware documentation.
+ */
+static struct intel_uncore_type nhmex_uncore_wbox = {
+ .name = "wbox",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_W_MSR_PMON_CNT0,
+ .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
+ .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
+ .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
+ .pair_ctr_ctl = 1,
+ .event_descs = nhmex_uncore_wbox_events,
+ .ops = &nhmex_uncore_ops,
+ .format_group = &nhmex_uncore_cbox_format_group
+};
+
+/*
+ * Set up the match/mask extra register for a Bbox event.
+ *
+ * The counter (config bits 6-7) and event select (config bits 1-5)
+ * decide whether the event uses the match/mask registers. If it does,
+ * config1 is stored as the match value and config2 as the mask value,
+ * and the MATCH MSR of box 0 or box 1 is chosen by pmu_idx.
+ *
+ * Always returns 0.
+ */
+static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+	int ctr, ev_sel;
+	bool uses_match_mask;
+
+	ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
+		NHMEX_B_PMON_CTR_SHIFT;
+	ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
+		NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
+
+	/* per counter, only some event selects use the match/mask registers */
+	switch (ctr) {
+	case 0:
+		uses_match_mask = !(ev_sel > 0x3);
+		break;
+	case 1:
+		uses_match_mask = !(ev_sel > 0x6);
+		break;
+	case 2:
+		uses_match_mask = !(ev_sel != 0x4);
+		break;
+	case 3:
+		uses_match_mask = false;
+		break;
+	default:
+		uses_match_mask = true;
+		break;
+	}
+	if (!uses_match_mask)
+		return 0;
+
+	match->reg = (box->pmu->pmu_idx == 0) ? NHMEX_B0_MSR_MATCH :
+						 NHMEX_B1_MSR_MATCH;
+	match->idx = 0;
+	match->config = event->attr.config1;
+	mask->config = event->attr.config2;
+	return 0;
+}
+
+/*
+ * Start a Bbox event. If a match/mask register was assigned, write the
+ * match value and then the mask value (the MSR after MATCH). Then write
+ * the control MSR with only the event-select bits and the bit-0 enable.
+ */
+static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+	u64 ctl = NHMEX_PMON_CTL_EN_BIT0 |
+		  (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK);
+
+	if (match->idx != EXTRA_REG_NONE) {
+		wrmsrl(match->reg, match->config);
+		wrmsrl(match->reg + 1, mask->config);
+	}
+	wrmsrl(hwc->config_base, ctl);
+}
+
+/*
+ * The Bbox has 4 counters, but each counter monitors different events.
+ * Use bits 6-7 in the event config to select counter.
+ *
+ * Each entry below pairs one value of bits 6-7 (compare mask 0xc0) with
+ * a single-bit value 1, 2, 4 or 8.
+ * NOTE(review): the middle argument is presumably the bitmask of allowed
+ * counters -- confirm against EVENT_CONSTRAINT().
+ */
+static struct event_constraint nhmex_uncore_bbox_constraints[] = {
+ EVENT_CONSTRAINT(0 , 1, 0xc0),
+ EVENT_CONSTRAINT(0x40, 2, 0xc0),
+ EVENT_CONSTRAINT(0x80, 4, 0xc0),
+ EVENT_CONSTRAINT(0xc0, 8, 0xc0),
+ EVENT_CONSTRAINT_END,
+};
+
+/* Bbox format fields: 5-bit event, counter select, and match/mask values. */
+static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_counter.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_bbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_bbox_formats_attr,
+};
+
+/* Bbox ops: common hooks, Bbox-specific enable/config, generic constraint helpers. */
+static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_bbox_msr_enable_event,
+ .hw_config = nhmex_bbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+/* Bbox: two boxes, four 48-bit counters each, one shared (match/mask) register. */
+static struct intel_uncore_type nhmex_uncore_bbox = {
+ .name = "bbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_B_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .constraints = nhmex_uncore_bbox_constraints,
+ .ops = &nhmex_uncore_bbox_ops,
+ .format_group = &nhmex_uncore_bbox_format_group
+};
+
+/*
+ * Set up the match/mask extra register for an Sbox event.
+ *
+ * Only the TO_R_PROG_EV event uses it. For that event config1 is stored
+ * as the match value and config2 as the mask value, and the MM_CFG MSR
+ * of box 0 or box 1 is chosen by pmu_idx.
+ *
+ * Always returns 0.
+ */
+static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+	u64 ev_sel = hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK;
+
+	if (ev_sel == NHMEX_S_EVENT_TO_R_PROG_EV) {
+		match->reg = (box->pmu->pmu_idx == 0) ? NHMEX_S0_MSR_MM_CFG :
+							 NHMEX_S1_MSR_MM_CFG;
+		match->idx = 0;
+		match->config = event->attr.config1;
+		mask->config = event->attr.config2;
+	}
+	return 0;
+}
+
+/*
+ * Start an Sbox event. If a match/mask register was assigned, the
+ * MM_CFG MSR is zeroed first, the match and mask values are written to
+ * the next two MSRs, and only then is the MM_CFG enable bit set. The
+ * control MSR is then written with the event config plus the bit-22
+ * enable.
+ */
+static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+
+	if (match->idx != EXTRA_REG_NONE) {
+		unsigned int mm_cfg = match->reg;
+
+		wrmsrl(mm_cfg, 0);
+		wrmsrl(mm_cfg + 1, match->config);
+		wrmsrl(mm_cfg + 2, mask->config);
+		wrmsrl(mm_cfg, NHMEX_S_PMON_MM_CFG_EN);
+	}
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
+}
+
+/* Sbox format fields: the Cbox set plus the match/mask values. */
+static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match.attr,
+ &format_attr_mask.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_sbox_formats_attr,
+};
+
+/* Sbox ops: common hooks, Sbox-specific enable/config, generic constraint helpers. */
+static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_sbox_msr_enable_event,
+ .hw_config = nhmex_sbox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+/* Sbox: two boxes, four 48-bit counters each, one shared (match/mask) register. */
+static struct intel_uncore_type nhmex_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
+ .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
+ .msr_offset = NHMEX_S_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 1,
+ .ops = &nhmex_uncore_sbox_ops,
+ .format_group = &nhmex_uncore_sbox_format_group
+};
+
+/*
+ * Indices into an Mbox's shared_regs[]. ZDP_CTL_FVC must stay last:
+ * the code treats every index >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC as one of
+ * the four event fields of that single register.
+ */
+enum {
+ EXTRA_REG_NHMEX_M_FILTER,
+ EXTRA_REG_NHMEX_M_DSP,
+ EXTRA_REG_NHMEX_M_ISS,
+ EXTRA_REG_NHMEX_M_MAP,
+ EXTRA_REG_NHMEX_M_MSC_THR,
+ EXTRA_REG_NHMEX_M_PGT,
+ EXTRA_REG_NHMEX_M_PLD,
+ EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
+};
+
+/*
+ * Maps Mbox event selectors to the extra MSR each one needs.
+ * nhmex_mbox_hw_config() scans the whole table, so an event listed
+ * twice (0xa) gets two extra registers. INC_SEL entries match when
+ * FLAG_MODE is clear, SET_FLAG_SEL entries when it is set.
+ */
+static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
+ MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
+ MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
+ MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
+ /* event 0xa uses two extra registers */
+ MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
+ MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
+ MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
+ /* events 0xd ~ 0x10 use the same extra register */
+ MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
+ MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
+ MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
+ EVENT_EXTRA_END
+};
+
+/*
+ * Nehalem-EX or Westmere-EX? When true the NHMEX_* ZDP_CTL_FVC bit
+ * layout is used, when false the WSMEX_* layout.
+ */
+static bool uncore_nhmex;
+
+/*
+ * Try to take a reference on an Mbox shared extra register.
+ *
+ * @idx: extra register index; values >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC
+ * select one of the four event fields of the ZDP_CTL_FVC register.
+ * @config: the value the caller wants programmed into the register.
+ *
+ * Returns true if the register was free or already holds a compatible
+ * value (the reference count is then raised), false on a conflict.
+ */
+static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
+{
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ bool ret = false;
+ u64 mask;
+
+ /* ordinary registers: one reference count, whole-value comparison */
+ if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ er = &box->shared_regs[idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (!atomic_read(&er->ref) || er->config == config) {
+ atomic_inc(&er->ref);
+ er->config = config;
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+ }
+ /*
+ * The ZDP_CTL_FVC MSR has 4 fields which are used to control
+ * events 0xd ~ 0x10. Besides these 4 fields, there are additional
+ * fields which are shared.
+ */
+ idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ if (WARN_ON_ONCE(idx >= 4))
+ return false;
+
+ /* mask of the shared fields */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
+ er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* er->ref packs four 8-bit counts, one per event field */
+ /* add mask of the non-shared field if it's in use */
+ if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
+ if (uncore_nhmex)
+ mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ }
+
+ /* accept when nobody uses the register or all compared bits agree */
+ if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ /* store the shared bits and this field only; other fields are kept */
+ if (uncore_nhmex)
+ mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ else
+ mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
+ WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+ er->config &= ~mask;
+ er->config |= (config & mask);
+ ret = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ return ret;
+}
+
+/*
+ * Drop one reference on the shared extra register selected by @idx.
+ * Indices below EXTRA_REG_NHMEX_M_ZDP_CTL_FVC have a plain counter; the
+ * others share the ZDP_CTL_FVC entry, whose ref packs one 8-bit count
+ * per event field.
+ */
+static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
+{
+	int field;
+
+	if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		atomic_dec(&box->shared_regs[idx].ref);
+		return;
+	}
+
+	field = idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	atomic_sub(1 << (field * 8),
+		   &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC].ref);
+}
+
+/*
+ * Re-target a ZDP_CTL_FVC event from its current event field to the
+ * field belonging to @new_idx.
+ *
+ * The event's 3-bit field is extracted from the extra register config,
+ * shifted to the position of the new field, and combined with the
+ * shared control bits of the original config.
+ *
+ * @event: event whose extra register currently selects a ZDP_CTL_FVC field
+ * @new_idx: extra register index of the field to move to
+ * @modify: when true, also update the event itself (main event selector,
+ * extra register config and index); when false the event is left
+ * untouched
+ *
+ * Returns the extra register config for the new field.
+ */
+static u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	u64 idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
+	u64 config = reg1->config;
+
+	/* get the non-shared control bits and shift them */
+	idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+	if (uncore_nhmex)
+		config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	else
+		config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
+	if (new_idx > orig_idx) {
+		idx = new_idx - orig_idx;
+		config <<= 3 * idx;
+	} else {
+		idx = orig_idx - new_idx;
+		config >>= 3 * idx;
+	}
+
+	/*
+	 * Add the shared control bits back, using only the mask of the
+	 * running CPU model. The Nehalem-EX mask must not be applied on
+	 * Westmere-EX: its bit 23 lies inside the fourth Westmere-EX event
+	 * field and would leak a stale event bit into the result.
+	 */
+	if (uncore_nhmex)
+		config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	else
+		config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
+	if (modify) {
+		/* adjust the main event selector */
+		if (new_idx > orig_idx)
+			hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		else
+			hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
+		reg1->config = config;
+		/* low byte is the new index; all higher bits are set */
+		reg1->idx = ~0xff | new_idx;
+	}
+	return config;
+}
+
+/*
+ * Acquire the shared extra registers an Mbox event needs.
+ *
+ * reg1->idx packs up to two 8-bit register indices (0xff means the slot
+ * is unused) and reg1->config packs the matching 32-bit values. reg2
+ * describes the optional match/mask filter register.
+ *
+ * Returns NULL when every needed register was obtained, or
+ * &uncore_constraint_empty when the event cannot be scheduled.
+ */
+static struct event_constraint *
+nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ int i, idx[2], alloc = 0;
+ u64 config1 = reg1->config;
+
+ idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
+ idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
+again:
+ for (i = 0; i < 2; i++) {
+ /* on a real box, skip slots whose register is already held */
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ idx[i] = 0xff;
+
+ if (idx[i] == 0xff)
+ continue;
+
+ if (!nhmex_mbox_get_shared_reg(box, idx[i],
+ __BITS_VALUE(config1, i, 32)))
+ goto fail;
+ /* remember what was taken in this call so "fail" can release it */
+ alloc |= (0x1 << i);
+ }
+
+ /* for the match/mask registers */
+ if (reg2->idx != EXTRA_REG_NONE &&
+ (uncore_box_is_fake(box) || !reg2->alloc) &&
+ !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
+ goto fail;
+
+ /*
+ * If it's a fake box -- as per validate_{group,event}() we
+ * shouldn't touch event state and we can avoid doing so
+ * since both will only call get_event_constraints() once
+ * on each event, this avoids the need for reg->alloc.
+ */
+ if (!uncore_box_is_fake(box)) {
+ /* commit a field switch made by the retry path below */
+ if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
+ nhmex_mbox_alter_er(event, idx[0], true);
+ reg1->alloc |= alloc;
+ if (reg2->idx != EXTRA_REG_NONE)
+ reg2->alloc = 1;
+ }
+ return NULL;
+fail:
+ if (idx[0] != 0xff && !(alloc & 0x1) &&
+ idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+ /*
+ * events 0xd ~ 0x10 are functional identical, but are
+ * controlled by different fields in the ZDP_CTL_FVC
+ * register. If we failed to take one field, try the
+ * rest 3 choices.
+ */
+ BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
+ idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ idx[0] = (idx[0] + 1) % 4;
+ idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
+ /* stop once the rotation is back at the event's original field */
+ if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
+ config1 = nhmex_mbox_alter_er(event, idx[0], false);
+ goto again;
+ }
+ }
+
+ /* release whatever this call managed to take */
+ if (alloc & 0x1)
+ nhmex_mbox_put_shared_reg(box, idx[0]);
+ if (alloc & 0x2)
+ nhmex_mbox_put_shared_reg(box, idx[1]);
+ return &uncore_constraint_empty;
+}
+
+/*
+ * Release the shared extra registers held by an Mbox event and clear
+ * its allocation flags. Does nothing for a fake box.
+ */
+static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+	int slot;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	/* bit N of reg1->alloc marks the register in slot N as held */
+	for (slot = 0; slot < 2; slot++) {
+		if (reg1->alloc & (0x1 << slot))
+			nhmex_mbox_put_shared_reg(box,
+					__BITS_VALUE(reg1->idx, slot, 8));
+	}
+	reg1->alloc = 0;
+
+	if (!reg2->alloc)
+		return;
+	nhmex_mbox_put_shared_reg(box, reg2->idx);
+	reg2->alloc = 0;
+}
+
+/*
+ * Return the extra register index for a table entry. For ZDP_CTL_FVC
+ * entries the index is offset by (event selector - 0xd), so each of the
+ * events sharing that register gets its own index.
+ */
+static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
+{
+	int reg_idx = er->idx;
+
+	if (reg_idx >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
+		reg_idx += (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
+	return reg_idx;
+}
+
+/*
+ * Work out which extra MSRs an Mbox event needs and record them.
+ *
+ * Each matching table entry fills one slot of reg1: an 8-bit register
+ * index in reg1->idx and a 16-bit MSR address in reg1->reg. The PLD
+ * register always goes in slot 1. When both slots end up used, the
+ * address filter register is set up in reg2 from config2.
+ *
+ * Returns 0 on success, -EINVAL if config1 has bits outside an entry's
+ * valid_mask or if one of the sanity checks trips.
+ */
+static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
+ struct extra_reg *er;
+ unsigned msr;
+ int reg_idx = 0;
+ /*
+ * The mbox events may require 2 extra MSRs at the most. But only
+ * the lower 32 bits in these MSRs are significant, so we can use
+ * config1 to pass two MSRs' config.
+ */
+ for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
+ if (er->event != (event->hw.config & er->config_mask))
+ continue;
+ if (event->attr.config1 & ~er->valid_mask)
+ return -EINVAL;
+
+ /* table entries hold box 0 addresses; offset them for this box */
+ msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
+ /* the address must fit a 16-bit slot and the index an 8-bit slot */
+ if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
+ return -EINVAL;
+
+ /* always use the 32~63 bits to pass the PLD config */
+ if (er->idx == EXTRA_REG_NHMEX_M_PLD)
+ reg_idx = 1;
+ else if (WARN_ON_ONCE(reg_idx > 0))
+ return -EINVAL;
+
+ /* clear the slot, then store the index and the MSR address in it */
+ reg1->idx &= ~(0xff << (reg_idx * 8));
+ reg1->reg &= ~(0xffff << (reg_idx * 16));
+ reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
+ reg1->reg |= msr << (reg_idx * 16);
+ reg1->config = event->attr.config1;
+ reg_idx++;
+ }
+ /*
+ * The mbox only provides ability to perform address matching
+ * for the PLD events.
+ */
+ if (reg_idx == 2) {
+ reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
+ /* ~0ULL marks "filter not requested"; the enable hook checks for it */
+ if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
+ reg2->config = event->attr.config2;
+ else
+ reg2->config = ~0ULL;
+ if (box->pmu->pmu_idx == 0)
+ reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
+ else
+ reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
+ }
+ return 0;
+}
+
+/*
+ * Return the current value stored for a shared extra register. Ordinary
+ * registers are read directly; every index at or above
+ * EXTRA_REG_NHMEX_M_ZDP_CTL_FVC reads the single ZDP_CTL_FVC entry
+ * under its lock.
+ */
+static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
+{
+	struct intel_uncore_extra_reg *er;
+	unsigned long flags;
+	u64 snapshot;
+
+	if (idx >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
+		er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
+		raw_spin_lock_irqsave(&er->lock, flags);
+		snapshot = er->config;
+		raw_spin_unlock_irqrestore(&er->lock, flags);
+		return snapshot;
+	}
+
+	return box->shared_regs[idx].config;
+}
+
+/*
+ * Start an Mbox event.
+ *
+ * Each used extra-register slot (index other than 0xff) is programmed
+ * with the current shared register value. If the address filter is
+ * assigned, its MM_CFG MSR is zeroed; when a filter value was requested
+ * (config is not ~0ULL) the match and mask MSRs are written and the
+ * filter is enabled. Finally the control MSR is written with the event
+ * config plus the bit-0 enable.
+ */
+static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *filter = &hwc->branch_reg;
+	int slot, idx;
+
+	for (slot = 0; slot < 2; slot++) {
+		idx = __BITS_VALUE(reg1->idx, slot, 8);
+		if (idx == 0xff)
+			continue;
+		wrmsrl(__BITS_VALUE(reg1->reg, slot, 16),
+			nhmex_mbox_shared_reg_config(box, idx));
+	}
+
+	if (filter->idx != EXTRA_REG_NONE) {
+		wrmsrl(filter->reg, 0);
+		if (filter->config != ~0ULL) {
+			wrmsrl(filter->reg + 1,
+				filter->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
+			wrmsrl(filter->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
+				(filter->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
+			wrmsrl(filter->reg, NHMEX_M_PMON_MM_CFG_EN);
+		}
+	}
+
+	wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
+}
+
+/*
+ * Mbox-specific format attributes. The filter_* fields live in config2.
+ * The extra register values live in config1: all but "pld" use bits
+ * 0-31 and "pld" uses bits 32-63.
+ */
+DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
+DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
+DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
+DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
+DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
+DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
+DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
+DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
+DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
+
+/* NULL-terminated list of every Mbox format attribute. */
+static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
+ &format_attr_count_mode.attr,
+ &format_attr_storage_mode.attr,
+ &format_attr_wrap_mode.attr,
+ &format_attr_flag_mode.attr,
+ &format_attr_inc_sel.attr,
+ &format_attr_set_flag_sel.attr,
+ &format_attr_filter_cfg_en.attr,
+ &format_attr_filter_match.attr,
+ &format_attr_filter_mask.attr,
+ &format_attr_dsp.attr,
+ &format_attr_thr.attr,
+ &format_attr_fvc.attr,
+ &format_attr_pgt.attr,
+ &format_attr_map.attr,
+ &format_attr_iss.attr,
+ &format_attr_pld.attr,
+ NULL,
+};
+
+/* attribute group named "format" that wraps the list above */
+static struct attribute_group nhmex_uncore_mbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_mbox_formats_attr,
+};
+
+/*
+ * Named Mbox events, Nehalem-EX encoding. The Westmere-EX table below
+ * defines the same two events with each fvc value doubled, matching its
+ * one-bit-higher ZDP_CTL_FVC layout.
+ */
+static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
+ { /* end: all zeroes */ },
+};
+
+/* Named Mbox events, Westmere-EX encoding. */
+static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
+ INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
+ { /* end: all zeroes */ },
+};
+
+/* Mbox ops: common hooks plus Mbox-specific enable, config and constraint handling. */
+static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_mbox_msr_enable_event,
+ .hw_config = nhmex_mbox_hw_config,
+ .get_constraint = nhmex_mbox_get_constraint,
+ .put_constraint = nhmex_mbox_put_constraint,
+};
+
+/*
+ * Mbox: two boxes, six 48-bit counters each, eight shared registers
+ * (one per entry of the EXTRA_REG_NHMEX_M_* enum).
+ * NOTE(review): .event_descs points at the Nehalem-EX table; presumably
+ * it is switched to wsmex_uncore_mbox_events on Westmere-EX elsewhere in
+ * this file -- confirm.
+ */
+static struct intel_uncore_type nhmex_uncore_mbox = {
+ .name = "mbox",
+ .num_counters = 6,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
+ .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
+ .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_M_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ .num_shared_regs = 8,
+ .event_descs = nhmex_uncore_mbox_events,
+ .ops = &nhmex_uncore_mbox_ops,
+ .format_group = &nhmex_uncore_mbox_format_group,
+};
+
+/*
+ * Switch an Rbox event to the other event of its pair.
+ *
+ * The extra register index and the main event selector move together by
+ * one step: down when the index is odd, up when it is even. If the new
+ * index is the 3rd or 4th event of its set, the extra register config
+ * is moved between bits 0~7 and bits 8~15 to match. @box is not used.
+ */
+static void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int pos;
+
+	/* adjust the main event selector and extra register index */
+	if (reg1->idx % 2) {
+		reg1->idx--;
+		hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	} else {
+		reg1->idx++;
+		hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	}
+
+	/* adjust extra register config */
+	pos = reg1->idx % 6;
+	if (pos == 2) {
+		/* shift the 8~15 bits to the 0~7 bits */
+		reg1->config >>= 8;
+	} else if (pos == 3) {
+		/* shift the 0~7 bits to the 8~15 bits */
+		reg1->config <<= 8;
+	}
+}
+
+/*
+ * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7.
+ * An event set consists of 6 events, the 3rd and 4th events in
+ * an event set use the same extra register. So an event set uses
+ * 5 extra registers.
+ *
+ * Acquire the shared extra register an Rbox event needs. If the
+ * register is held with an incompatible value, the paired event
+ * (idx ^ 1) is tried once. Returns NULL on success and
+ * &uncore_constraint_empty when neither event of the pair can be used.
+ */
+static struct event_constraint *
+nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ struct intel_uncore_extra_reg *er;
+ unsigned long flags;
+ int idx, er_idx;
+ u64 config1;
+ bool ok = false;
+
+ /* on a real box, nothing to do if the register is already held */
+ if (!uncore_box_is_fake(box) && reg1->alloc)
+ return NULL;
+
+ /* idx: position of the event inside its set of six */
+ idx = reg1->idx % 6;
+ config1 = reg1->config;
+again:
+ er_idx = idx;
+ /* the 3rd and 4th events use the same extra register */
+ if (er_idx > 2)
+ er_idx--;
+ /* five shared registers per event set */
+ er_idx += (reg1->idx / 6) * 5;
+
+ er = &box->shared_regs[er_idx];
+ raw_spin_lock_irqsave(&er->lock, flags);
+ if (idx < 2) {
+ /* whole-register comparison, single reference count */
+ if (!atomic_read(&er->ref) || er->config == reg1->config) {
+ atomic_inc(&er->ref);
+ er->config = reg1->config;
+ ok = true;
+ }
+ } else if (idx == 2 || idx == 3) {
+ /*
+ * these two events use different fields in a extra register,
+ * the 0~7 bits and the 8~15 bits respectively.
+ */
+ u64 mask = 0xff << ((idx - 2) * 8);
+ /* er->ref holds one 8-bit count per field */
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
+ !((er->config ^ config1) & mask)) {
+ atomic_add(1 << ((idx - 2) * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ } else {
+ /* three values must agree: upper 32 config bits, config1 and config2 */
+ if (!atomic_read(&er->ref) ||
+ (er->config == (hwc->config >> 32) &&
+ er->config1 == reg1->config &&
+ er->config2 == reg2->config)) {
+ atomic_inc(&er->ref);
+ er->config = (hwc->config >> 32);
+ er->config1 = reg1->config;
+ er->config2 = reg2->config;
+ ok = true;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ /*
+ * The Rbox events are always in pairs. The paired
+ * events are functional identical, but use different
+ * extra registers. If we failed to take an extra
+ * register, try the alternative.
+ */
+ idx ^= 1;
+ /* retry only once: stop when idx is back at the original position */
+ if (idx != reg1->idx % 6) {
+ if (idx == 2)
+ config1 >>= 8;
+ else if (idx == 3)
+ config1 <<= 8;
+ goto again;
+ }
+ } else {
+ if (!uncore_box_is_fake(box)) {
+ /* commit the switch to the paired event, if one was made */
+ if (idx != reg1->idx % 6)
+ nhmex_rbox_alter_er(box, event);
+ reg1->alloc = 1;
+ }
+ return NULL;
+ }
+ return &uncore_constraint_empty;
+}
+
+/*
+ * Release the shared extra register held by @event, if any. Nothing is
+ * done for fake boxes or for events that never took a register.
+ */
+static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er;
+	int slot, shared_idx;
+
+	if (uncore_box_is_fake(box) || !reg1->alloc)
+		return;
+
+	/* position of the event inside its 6-event set */
+	slot = reg1->idx % 6;
+
+	/* slots 2 and 3 share one register, so later slots move down by one */
+	shared_idx = (slot > 2) ? slot - 1 : slot;
+	shared_idx += (reg1->idx / 6) * 5;
+	er = &box->shared_regs[shared_idx];
+
+	switch (slot) {
+	case 2:
+	case 3:
+		/* drop only the 8-bit reference field owned by this slot */
+		atomic_sub(1 << ((slot - 2) * 8), &er->ref);
+		break;
+	default:
+		atomic_dec(&er->ref);
+		break;
+	}
+
+	reg1->alloc = 0;
+}
+
+/*
+ * Decode the event select of @event and record the extra register values
+ * it needs. Returns -EINVAL for event selects of 0x18 and above, else 0.
+ */
+static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+	int ev_sel, slot;
+
+	ev_sel = (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
+		NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
+	if (ev_sel >= 0x18)
+		return -EINVAL;
+
+	reg1->idx = ev_sel;
+	reg1->config = event->attr.config1;
+
+	/* only the last two events of a set take the upper config half and config2 */
+	slot = ev_sel % 6;
+	if (slot == 4 || slot == 5) {
+		hwc->config |= event->attr.config & (~0ULL << 32);
+		reg2->config = event->attr.config2;
+	}
+
+	return 0;
+}
+
+/*
+ * Program the per-port extra register(s) selected by the event's position
+ * in its event set (reg1->idx % 6), then enable the counter by writing the
+ * event select together with the enable bit to the event's control MSR.
+ */
+static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
+ int idx, port;
+
+ idx = reg1->idx;
+ /* one port per event set; each rbox instance covers 4 ports */
+ port = idx / 6 + box->pmu->pmu_idx * 4;
+
+ switch (idx % 6) {
+ case 0:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
+ break;
+ case 1:
+ wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
+ break;
+ case 2:
+ case 3:
+ /*
+ * both events share one register: write the combined value kept
+ * in shared register 2 of this event set, not reg1->config
+ */
+ wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
+ uncore_shared_reg_config(box, 2 + (idx / 6) * 5));
+ break;
+ case 4:
+ /* the upper 32 bits of hwc->config carry the MM_CFG value */
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
+ break;
+ case 5:
+ /* same layout as case 4, but for the SET2 registers */
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
+ hwc->config >> 32);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
+ wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
+ break;
+ }
+
+ /* only the event select bits of hwc->config reach the control MSR */
+ wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
+ (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
+DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
+DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
+
+static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
+ &format_attr_event5.attr,
+ &format_attr_xbr_mm_cfg.attr,
+ &format_attr_xbr_match.attr,
+ &format_attr_xbr_mask.attr,
+ &format_attr_qlx_cfg.attr,
+ &format_attr_iperf_cfg.attr,
+ NULL,
+};
+
+static struct attribute_group nhmex_uncore_rbox_format_group = {
+ .name = "format",
+ .attrs = nhmex_uncore_rbox_formats_attr,
+};
+
+static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
+ INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
+ INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
+ INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
+ NHMEX_UNCORE_OPS_COMMON_INIT(),
+ .enable_event = nhmex_rbox_msr_enable_event,
+ .hw_config = nhmex_rbox_hw_config,
+ .get_constraint = nhmex_rbox_get_constraint,
+ .put_constraint = nhmex_rbox_put_constraint,
+};
+
+/* Type descriptor for the Nehalem-EX rbox uncore PMU. */
+static struct intel_uncore_type nhmex_uncore_rbox = {
+ .name = "rbox",
+ .num_counters = 8,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .event_ctl = NHMEX_R_MSR_PMON_CTL0,
+ .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
+ .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
+ .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
+ .msr_offset = NHMEX_R_MSR_OFFSET,
+ .pair_ctr_ctl = 1,
+ /* 4 event sets per rbox, 5 shared extra registers per event set */
+ .num_shared_regs = 20,
+ .event_descs = nhmex_uncore_rbox_events,
+ .ops = &nhmex_uncore_rbox_ops,
+ .format_group = &nhmex_uncore_rbox_format_group
+};
+
+static struct intel_uncore_type *nhmex_msr_uncores[] = {
+ &nhmex_uncore_ubox,
+ &nhmex_uncore_cbox,
+ &nhmex_uncore_bbox,
+ &nhmex_uncore_sbox,
+ &nhmex_uncore_mbox,
+ &nhmex_uncore_rbox,
+ &nhmex_uncore_wbox,
+ NULL,
+};
+
+/*
+ * Select the Nehalem-EX MSR uncore types. Model 46 sets uncore_nhmex;
+ * every other model switches the mbox to the wsmex event descriptions.
+ * The cbox count is clamped to boot_cpu_data.x86_max_cores.
+ */
+void nhmex_uncore_cpu_init(void)
+{
+	if (boot_cpu_data.x86_model != 46)
+		nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
+	else
+		uncore_nhmex = true;
+
+	uncore_msr_uncores = nhmex_msr_uncores;
+
+	if (boot_cpu_data.x86_max_cores < nhmex_uncore_cbox.num_boxes)
+		nhmex_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+}
+/* end of Nehalem-EX uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
new file mode 100644
index 000000000000..3001015b755c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -0,0 +1,636 @@
+/* Nehalem/SandyBridge/Haswell uncore support */
+#include "perf_event_intel_uncore.h"
+
+/* SNB event control */
+#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
+#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
+#define SNB_UNC_CTL_EDGE_DET (1 << 18)
+#define SNB_UNC_CTL_EN (1 << 22)
+#define SNB_UNC_CTL_INVERT (1 << 23)
+#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
+#define NHM_UNC_CTL_CMASK_MASK 0xff000000
+#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
+
+#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ SNB_UNC_CTL_CMASK_MASK)
+
+#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ NHM_UNC_CTL_CMASK_MASK)
+
+/* SNB global control register */
+#define SNB_UNC_PERF_GLOBAL_CTL 0x391
+#define SNB_UNC_FIXED_CTR_CTRL 0x394
+#define SNB_UNC_FIXED_CTR 0x395
+
+/* SNB uncore global control */
+#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
+#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
+
+/* SNB Cbo register */
+#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
+#define SNB_UNC_CBO_0_PER_CTR0 0x706
+#define SNB_UNC_CBO_MSR_OFFSET 0x10
+
+/* NHM global control register */
+#define NHM_UNC_PERF_GLOBAL_CTL 0x391
+#define NHM_UNC_FIXED_CTR 0x394
+#define NHM_UNC_FIXED_CTR_CTRL 0x395
+
+/* NHM uncore global control */
+#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
+#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
+
+/* NHM uncore register */
+#define NHM_UNC_PERFEVTSEL0 0x3c0
+#define NHM_UNC_UNCORE_PMC0 0x3b0
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+
+/* Sandy Bridge uncore support */
+/*
+ * Enable @event: generic counters get their event configuration plus the
+ * enable bit, the fixed counter gets the enable bit alone.
+ */
+static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 ctl = SNB_UNC_CTL_EN;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		ctl |= hwc->config;
+
+	wrmsrl(hwc->config_base, ctl);
+}
+
+/* Disable @event by writing zero to its control MSR (hw.config_base). */
+static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+ wrmsrl(event->hw.config_base, 0);
+}
+
+/*
+ * Write the uncore global control MSR (global enable plus all core bits).
+ * Only the box of the first pmu (pmu_idx == 0) performs the write.
+ */
+static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	if (box->pmu->pmu_idx != 0)
+		return;
+
+	wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
+	       SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
+}
+
+static struct uncore_event_desc snb_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ { /* end: all zeroes */ },
+};
+
+static struct attribute *snb_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask5.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_formats_attr,
+};
+
+static struct intel_uncore_ops snb_uncore_msr_ops = {
+ .init_box = snb_uncore_msr_init_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct event_constraint snb_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type snb_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
+ .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
+ .fixed_ctr = SNB_UNC_FIXED_CTR,
+ .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
+ .constraints = snb_uncore_cbox_constraints,
+ .ops = &snb_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+ .event_descs = snb_uncore_events,
+};
+
+static struct intel_uncore_type *snb_msr_uncores[] = {
+ &snb_uncore_cbox,
+ NULL,
+};
+
+/*
+ * Select the Sandy Bridge MSR uncore types and clamp the cbox count to
+ * boot_cpu_data.x86_max_cores.
+ */
+void snb_uncore_cpu_init(void)
+{
+	if (boot_cpu_data.x86_max_cores < snb_uncore_cbox.num_boxes)
+		snb_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+	uncore_msr_uncores = snb_msr_uncores;
+}
+
+enum {
+ SNB_PCI_UNCORE_IMC,
+};
+
+/*
+ * Named IMC events. The scale 6.103515625e-5 equals 64 / 2^20, i.e. it
+ * turns a raw count into MiB if one count stands for 64 bytes
+ * (NOTE(review): presumably one cache line per count - confirm).
+ */
+static struct uncore_event_desc snb_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(data_reads, "event=0x01"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_reads.unit, "MiB"),
+
+ INTEL_UNCORE_EVENT_DESC(data_writes, "event=0x02"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"),
+ INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"),
+
+ { /* end: all zeroes */ },
+};
+
+#define SNB_UNCORE_PCI_IMC_EVENT_MASK 0xff
+#define SNB_UNCORE_PCI_IMC_BAR_OFFSET 0x48
+
+/* page size multiple covering all config regs */
+#define SNB_UNCORE_PCI_IMC_MAP_SIZE 0x6000
+
+#define SNB_UNCORE_PCI_IMC_DATA_READS 0x1
+#define SNB_UNCORE_PCI_IMC_DATA_READS_BASE 0x5050
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES 0x2
+#define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054
+#define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE
+
+static struct attribute *snb_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group snb_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = snb_uncore_imc_formats_attr,
+};
+
+/*
+ * Map the IMC counter registers: read the base address from PCI config
+ * space at SNB_UNCORE_PCI_IMC_BAR_OFFSET, round it down to a page boundary
+ * and ioremap SNB_UNCORE_PCI_IMC_MAP_SIZE bytes from there. Also selects
+ * the IMC-specific hrtimer interval for the box.
+ */
+static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
+ resource_size_t addr;
+ u32 pci_dword;
+
+ /* low 32 bits of the base address */
+ pci_read_config_dword(pdev, where, &pci_dword);
+ addr = pci_dword;
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ /* high 32 bits, only representable with 64-bit physical addresses */
+ pci_read_config_dword(pdev, where + 4, &pci_dword);
+ addr |= ((resource_size_t)pci_dword << 32);
+#endif
+
+ /* drop the low (sub-page) bits of the register value */
+ addr &= ~(PAGE_SIZE - 1);
+
+ /* NOTE(review): the ioremap() result is not checked for failure here */
+ box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+ box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
+}
+
+/*
+ * The four box/event enable and disable hooks below are intentionally
+ * empty: the IMC counters are free running, so nothing has to be written
+ * to start or stop them.
+ */
+static void snb_uncore_imc_enable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_disable_box(struct intel_uncore_box *box)
+{}
+
+static void snb_uncore_imc_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+static void snb_uncore_imc_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{}
+
+/*
+ * Read the counter of @event from the mapped IMC register area: one
+ * unsigned int at box->io_addr + hwc->event_base, widened to u64.
+ */
+static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* NOTE(review): plain dereference of the ioremap()ed address, no MMIO accessor */
+ return (u64)*(unsigned int *)(box->io_addr + hwc->event_base);
+}
+
+/*
+ * custom event_init() function because we define our own fixed, free
+ * running counters, so we do not want to conflict with generic uncore
+ * logic. Also simplifies processing
+ *
+ * Returns 0 on success, -ENOENT when the event does not belong to this
+ * pmu (or the pmu has no device), and -EINVAL for unsupported attributes
+ * or unknown event codes.
+ */
+static int snb_uncore_imc_event_init(struct perf_event *event)
+{
+ struct intel_uncore_pmu *pmu;
+ struct intel_uncore_box *box;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 cfg = event->attr.config & SNB_UNCORE_PCI_IMC_EVENT_MASK;
+ int idx, base;
+
+ /* not an event of this pmu */
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ pmu = uncore_event_to_pmu(event);
+ /* no device found for this pmu */
+ if (pmu->func_id < 0)
+ return -ENOENT;
+
+ /* Sampling not supported yet */
+ if (hwc->sample_period)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ /*
+ * Place all uncore events for a particular physical package
+ * onto a single cpu
+ */
+ if (event->cpu < 0)
+ return -EINVAL;
+
+ /* check only supported bits are set */
+ if (event->attr.config & ~SNB_UNCORE_PCI_IMC_EVENT_MASK)
+ return -EINVAL;
+
+ box = uncore_pmu_to_box(pmu, event->cpu);
+ if (!box || box->cpu < 0)
+ return -EINVAL;
+
+ /* redirect the event to the cpu that owns the box */
+ event->cpu = box->cpu;
+
+ event->hw.idx = -1;
+ event->hw.last_tag = ~0ULL;
+ event->hw.extra_reg.idx = EXTRA_REG_NONE;
+ event->hw.branch_reg.idx = EXTRA_REG_NONE;
+ /*
+ * check event is known (whitelist, determines counter)
+ */
+ switch (cfg) {
+ case SNB_UNCORE_PCI_IMC_DATA_READS:
+ base = SNB_UNCORE_PCI_IMC_DATA_READS_BASE;
+ idx = UNCORE_PMC_IDX_FIXED;
+ break;
+ case SNB_UNCORE_PCI_IMC_DATA_WRITES:
+ base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE;
+ idx = UNCORE_PMC_IDX_FIXED + 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* must be done before validate_group */
+ event->hw.event_base = base;
+ event->hw.config = cfg;
+ event->hw.idx = idx;
+
+ /* no group validation needed, we have free running counters */
+
+ return 0;
+}
+
+/* No per-event hardware configuration is needed for the IMC; always returns 0. */
+static int snb_uncore_imc_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return 0;
+}
+
+/*
+ * Start counting for @event: mark it running, put it on the box's active
+ * list and take the current counter value as the baseline for later
+ * deltas. The first active event on a box starts the box's hrtimer.
+ */
+static void snb_uncore_imc_event_start(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ u64 count;
+
+ /* starting an event that is not stopped is a caller bug */
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+ box->n_active++;
+
+ list_add_tail(&event->active_entry, &box->active_list);
+
+ /* the counter itself is never reset, so remember where it stands now */
+ count = snb_uncore_imc_read_counter(box, event);
+ local64_set(&event->hw.prev_count, count);
+
+ if (box->n_active == 1)
+ uncore_pmu_start_hrtimer(box);
+}
+
+/*
+ * Stop counting for @event: take it off the box's active list and, when
+ * it was the last active event, cancel the box's hrtimer. With
+ * PERF_EF_UPDATE in @flags the outstanding delta is folded into the event
+ * count unless the event is already up to date.
+ */
+static void snb_uncore_imc_event_stop(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!(hwc->state & PERF_HES_STOPPED)) {
+ box->n_active--;
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ list_del(&event->active_entry);
+
+ if (box->n_active == 0)
+ uncore_pmu_cancel_hrtimer(box);
+ }
+
+ if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ uncore_perf_event_update(box, event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+/*
+ * Add @event to its box and start it. Returns -ENODEV when the event has
+ * no box, 0 otherwise.
+ */
+static int snb_uncore_imc_event_add(struct perf_event *event, int flags)
+{
+ struct intel_uncore_box *box = uncore_event_to_box(event);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box)
+ return -ENODEV;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ if (!(flags & PERF_EF_START))
+ hwc->state |= PERF_HES_ARCH;
+
+ /*
+ * NOTE(review): the event is started even without PERF_EF_START, and
+ * snb_uncore_imc_event_start() resets hw.state to 0, which drops the
+ * PERF_HES_ARCH bit set just above - confirm this is intended.
+ */
+ snb_uncore_imc_event_start(event, 0);
+
+ /* the event is only counted here, it is not stored in box->event_list */
+ box->n_events++;
+
+ return 0;
+}
+
+/*
+ * Remove @event from its box: stop it, folding the final delta into the
+ * event count, and give back the slot accounted for by
+ * snb_uncore_imc_event_add().
+ *
+ * snb_uncore_imc_event_add() increments box->n_events without storing the
+ * event in box->event_list, so searching that list for the event never
+ * matches; n_events would then only ever grow, and with it the number of
+ * event_list entries the search walks over. Balance the add
+ * unconditionally instead.
+ */
+static void snb_uncore_imc_event_del(struct perf_event *event, int flags)
+{
+	struct intel_uncore_box *box = uncore_event_to_box(event);
+
+	snb_uncore_imc_event_stop(event, PERF_EF_UPDATE);
+
+	/* never let the count go negative, even on an unbalanced del */
+	if (box->n_events > 0)
+		box->n_events--;
+}
+
+/*
+ * Look up the first Intel PCI device with id @devid and map its bus
+ * number to physical id 0. Returns -ENOTTY when no such device exists,
+ * 0 otherwise.
+ */
+static int snb_pci2phy_map_init(int devid)
+{
+	struct pci_dev *dev;
+
+	dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, NULL);
+	if (dev == NULL)
+		return -ENOTTY;
+
+	uncore_pcibus_to_physid[dev->bus->number] = 0;
+
+	/* drop the reference taken by pci_get_device() */
+	pci_dev_put(dev);
+	return 0;
+}
+
+static struct pmu snb_uncore_imc_pmu = {
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = snb_uncore_imc_event_init,
+ .add = snb_uncore_imc_event_add,
+ .del = snb_uncore_imc_event_del,
+ .start = snb_uncore_imc_event_start,
+ .stop = snb_uncore_imc_event_stop,
+ .read = uncore_pmu_event_read,
+};
+
+static struct intel_uncore_ops snb_uncore_imc_ops = {
+ .init_box = snb_uncore_imc_init_box,
+ .enable_box = snb_uncore_imc_enable_box,
+ .disable_box = snb_uncore_imc_disable_box,
+ .disable_event = snb_uncore_imc_disable_event,
+ .enable_event = snb_uncore_imc_enable_event,
+ .hw_config = snb_uncore_imc_hw_config,
+ .read_counter = snb_uncore_imc_read_counter,
+};
+
+static struct intel_uncore_type snb_uncore_imc = {
+ .name = "imc",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 32,
+ .fixed_ctr = SNB_UNCORE_PCI_IMC_CTR_BASE,
+ .event_descs = snb_uncore_imc_events,
+ .format_group = &snb_uncore_imc_format_group,
+ .perf_ctr = SNB_UNCORE_PCI_IMC_DATA_READS_BASE,
+ .event_mask = SNB_UNCORE_PCI_IMC_EVENT_MASK,
+ .ops = &snb_uncore_imc_ops,
+ .pmu = &snb_uncore_imc_pmu,
+};
+
+static struct intel_uncore_type *snb_pci_uncores[] = {
+ [SNB_PCI_UNCORE_IMC] = &snb_uncore_imc,
+ NULL,
+};
+
+static const struct pci_device_id snb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id ivb_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static const struct pci_device_id hsw_uncore_pci_ids[] = {
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* end: all zeroes */ },
+};
+
+static struct pci_driver snb_uncore_pci_driver = {
+ .name = "snb_uncore",
+ .id_table = snb_uncore_pci_ids,
+};
+
+static struct pci_driver ivb_uncore_pci_driver = {
+ .name = "ivb_uncore",
+ .id_table = ivb_uncore_pci_ids,
+};
+
+static struct pci_driver hsw_uncore_pci_driver = {
+ .name = "hsw_uncore",
+ .id_table = hsw_uncore_pci_ids,
+};
+
+struct imc_uncore_pci_dev {
+ __u32 pci_id;
+ struct pci_driver *driver;
+};
+#define IMC_DEV(a, d) \
+ { .pci_id = PCI_DEVICE_ID_INTEL_##a, .driver = (d) }
+
+static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
+ IMC_DEV(SNB_IMC, &snb_uncore_pci_driver),
+ IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
+ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
+ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
+ { /* end marker */ }
+};
+
+
+#define for_each_imc_pci_id(x, t) \
+ for (x = (t); (x)->pci_id; x++)
+
+/*
+ * Walk desktop_imc_pci_ids and return the driver of the first entry whose
+ * device is present (snb_pci2phy_map_init() succeeds), or NULL when none
+ * of the listed devices is found.
+ */
+static struct pci_driver *imc_uncore_find_dev(void)
+{
+	const struct imc_uncore_pci_dev *entry;
+
+	for_each_imc_pci_id(entry, desktop_imc_pci_ids) {
+		if (snb_pci2phy_map_init(entry->pci_id) != 0)
+			continue;
+		return entry->driver;
+	}
+
+	return NULL;
+}
+
+/*
+ * Pick the PCI driver matching the IMC device present in the system and
+ * install it together with the SNB PCI uncore types. Returns -ENODEV when
+ * no supported IMC device is found, 0 otherwise.
+ */
+static int imc_uncore_pci_init(void)
+{
+	struct pci_driver *imc_drv;
+
+	imc_drv = imc_uncore_find_dev();
+	if (imc_drv == NULL)
+		return -ENODEV;
+
+	uncore_pci_driver = imc_drv;
+	uncore_pci_uncores = snb_pci_uncores;
+
+	return 0;
+}
+
+/*
+ * Per-generation entry points. All three share imc_uncore_pci_init(),
+ * which works out the matching driver from the device that is present.
+ */
+int snb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+int ivb_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+int hsw_uncore_pci_init(void)
+{
+ return imc_uncore_pci_init();
+}
+
+/* end of Sandy Bridge uncore support */
+
+/* Nehalem uncore support */
+/* Disable the box by clearing the uncore global control MSR. */
+static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+/*
+ * Enable the box: set the EN_PC_ALL bits (bits 0-7) and the EN_FC bit
+ * (bit 32) in the uncore global control MSR.
+ */
+static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
+}
+
+/*
+ * Enable @event: generic counters get their event configuration plus the
+ * enable bit, the fixed counter gets its own enable bit alone.
+ */
+static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 ctl;
+
+	if (hwc->idx < UNCORE_PMC_IDX_FIXED)
+		ctl = hwc->config | SNB_UNC_CTL_EN;
+	else
+		ctl = NHM_UNC_FIXED_CTR_CTL_EN;
+
+	wrmsrl(hwc->config_base, ctl);
+}
+
+static struct attribute *nhm_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_cmask8.attr,
+ NULL,
+};
+
+static struct attribute_group nhm_uncore_format_group = {
+ .name = "format",
+ .attrs = nhm_uncore_formats_attr,
+};
+
+static struct uncore_event_desc nhm_uncore_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
+ INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_ops nhm_uncore_msr_ops = {
+ .disable_box = nhm_uncore_msr_disable_box,
+ .enable_box = nhm_uncore_msr_enable_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = nhm_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct intel_uncore_type nhm_uncore = {
+ .name = "",
+ .num_counters = 8,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .event_ctl = NHM_UNC_PERFEVTSEL0,
+ .perf_ctr = NHM_UNC_UNCORE_PMC0,
+ .fixed_ctr = NHM_UNC_FIXED_CTR,
+ .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
+ .event_mask = NHM_UNC_RAW_EVENT_MASK,
+ .event_descs = nhm_uncore_events,
+ .ops = &nhm_uncore_msr_ops,
+ .format_group = &nhm_uncore_format_group,
+};
+
+static struct intel_uncore_type *nhm_msr_uncores[] = {
+ &nhm_uncore,
+ NULL,
+};
+
+/* Select the Nehalem MSR uncore types. */
+void nhm_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = nhm_msr_uncores;
+}
+
+/* end of Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
new file mode 100644
index 000000000000..adf138eac85c
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -0,0 +1,2258 @@
+/* SandyBridge-EP/IvyTown uncore support */
+#include "perf_event_intel_uncore.h"
+
+
+/* SNB-EP Box level control */
+#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
+#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
+#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
+#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
+#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS | \
+ SNBEP_PMON_BOX_CTL_FRZ_EN)
+/* SNB-EP event control */
+#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
+#define SNBEP_PMON_CTL_RST (1 << 17)
+#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
+#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21)
+#define SNBEP_PMON_CTL_EN (1 << 22)
+#define SNBEP_PMON_CTL_INVERT (1 << 23)
+#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
+#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+
+/* SNB-EP Ubox event control */
+#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+
+#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
+#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* SNB-EP PCU event control */
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
+#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
+#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
+#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PMON_CTL_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+
+#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+/* SNB-EP pci control register */
+#define SNBEP_PCI_PMON_BOX_CTL 0xf4
+#define SNBEP_PCI_PMON_CTL0 0xd8
+/* SNB-EP pci counter register */
+#define SNBEP_PCI_PMON_CTR0 0xa0
+
+/* SNB-EP home agent register */
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
+#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
+#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
+/* SNB-EP memory controller register */
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
+#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
+/* SNB-EP QPI register */
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
+#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
+#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
+
+/* SNB-EP Ubox register */
+#define SNBEP_U_MSR_PMON_CTR0 0xc16
+#define SNBEP_U_MSR_PMON_CTL0 0xc10
+
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
+#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
+
+/* SNB-EP Cbo register */
+#define SNBEP_C0_MSR_PMON_CTR0 0xd16
+#define SNBEP_C0_MSR_PMON_CTL0 0xd10
+#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
+#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
+#define SNBEP_CBO_MSR_OFFSET 0x20
+
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_TID 0x1f
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_NID 0x3fc00
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE 0x7c0000
+#define SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC 0xff800000
+
+#define SNBEP_CBO_EVENT_EXTRA_REG(e, m, i) { \
+ .event = (e), \
+ .msr = SNBEP_C0_MSR_PMON_BOX_FILTER, \
+ .config_mask = (m), \
+ .idx = (i) \
+}
+
+/* SNB-EP PCU register */
+#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
+#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
+#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
+#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
+#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
+#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
+
+/* IVBEP event control */
+#define IVBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
+ SNBEP_PMON_BOX_CTL_RST_CTRS)
+#define IVBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PMON_CTL_TRESH_MASK)
+/* IVBEP Ubox */
+#define IVBEP_U_MSR_PMON_GLOBAL_CTL 0xc00
+#define IVBEP_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
+#define IVBEP_U_PMON_GLOBAL_UNFRZ_ALL (1 << 29)
+
+#define IVBEP_U_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_UMASK_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
+/* IVBEP Cbo */
+#define IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_TID (0x1fULL << 0)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 5)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x3fULL << 17)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+/* IVBEP home agent */
+#define IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST (1 << 16)
+#define IVBEP_HA_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ IVBEP_HA_PCI_PMON_CTL_Q_OCC_RST)
+/* IVBEP PCU */
+#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
+ (SNBEP_PMON_CTL_EV_SEL_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
+ SNBEP_PMON_CTL_EDGE_DET | \
+ SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
+ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
+/* IVBEP QPI */
+#define IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
+ (IVBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_PMON_CTL_EV_SEL_EXT)
+
+#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
+ ((1ULL << (n)) - 1)))
+
+/*
+ * Haswell-EP Ubox
+ *
+ * The event control MSRs come before the counter MSRs, as for every other
+ * Haswell-EP box below; the two addresses used to be swapped.
+ */
+#define HSWEP_U_MSR_PMON_CTR0 0x709
+#define HSWEP_U_MSR_PMON_CTL0 0x705
+#define HSWEP_U_MSR_PMON_FILTER 0x707
+
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL 0x703
+#define HSWEP_U_MSR_PMON_UCLK_FIXED_CTR 0x704
+
+#define HSWEP_U_MSR_PMON_BOX_FILTER_TID (0x1 << 0)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_CID (0x1fULL << 1)
+#define HSWEP_U_MSR_PMON_BOX_FILTER_MASK \
+ (HSWEP_U_MSR_PMON_BOX_FILTER_TID | \
+ HSWEP_U_MSR_PMON_BOX_FILTER_CID)
+
+/* Haswell-EP CBo */
+#define HSWEP_C0_MSR_PMON_CTR0 0xe08
+#define HSWEP_C0_MSR_PMON_CTL0 0xe01
+#define HSWEP_C0_MSR_PMON_BOX_CTL 0xe00
+#define HSWEP_C0_MSR_PMON_BOX_FILTER0 0xe05
+#define HSWEP_CBO_MSR_OFFSET 0x10
+
+
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_TID (0x3fULL << 0)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK (0xfULL << 6)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE (0x7fULL << 17)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NID (0xffffULL << 32)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC (0x1ffULL << 52)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62)
+#define HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63)
+
+
+/* Haswell-EP Sbox */
+#define HSWEP_S0_MSR_PMON_CTR0 0x726
+#define HSWEP_S0_MSR_PMON_CTL0 0x721
+#define HSWEP_S0_MSR_PMON_BOX_CTL 0x720
+#define HSWEP_SBOX_MSR_OFFSET 0xa
+#define HSWEP_S_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
+ SNBEP_CBO_PMON_CTL_TID_EN)
+
+/* Haswell-EP PCU */
+#define HSWEP_PCU_MSR_PMON_CTR0 0x717
+#define HSWEP_PCU_MSR_PMON_CTL0 0x711
+#define HSWEP_PCU_MSR_PMON_BOX_CTL 0x710
+#define HSWEP_PCU_MSR_PMON_BOX_FILTER 0x715
+
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
+DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
+DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
+/* occ_edge is the single bit SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) */
+DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:31");
+/*
+ * Format attributes for fields carried in config1 (box filters, QPI packet
+ * match) and config2 (QPI packet mask).  Numbered variants (filter_tid2,
+ * filter_nid2, filter_state3, ...) expose the same field name with a
+ * different bit layout.
+ */
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0");
+DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22");
+DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
+DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60");
+DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62");
+DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61");
+DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63");
+/* PCU band filters: four 8-bit bands packed into the low 32 bits of config1 */
+DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
+DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
+/* QPI packet match fields (config1) */
+DEFINE_UNCORE_FORMAT_ATTR(match_rds, match_rds, "config1:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid30, match_rnid30, "config1:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(match_rnid4, match_rnid4, "config1:31");
+DEFINE_UNCORE_FORMAT_ATTR(match_dnid, match_dnid, "config1:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(match_mc, match_mc, "config1:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(match_opc, match_opc, "config1:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(match_vnw, match_vnw, "config1:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(match0, match0, "config1:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(match1, match1, "config1:32-63");
+/* QPI packet mask fields (config2), laid out like the match fields */
+DEFINE_UNCORE_FORMAT_ATTR(mask_rds, mask_rds, "config2:48-51");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid30, mask_rnid30, "config2:32-35");
+DEFINE_UNCORE_FORMAT_ATTR(mask_rnid4, mask_rnid4, "config2:31");
+DEFINE_UNCORE_FORMAT_ATTR(mask_dnid, mask_dnid, "config2:13-17");
+DEFINE_UNCORE_FORMAT_ATTR(mask_mc, mask_mc, "config2:9-12");
+DEFINE_UNCORE_FORMAT_ATTR(mask_opc, mask_opc, "config2:5-8");
+DEFINE_UNCORE_FORMAT_ATTR(mask_vnw, mask_vnw, "config2:3-4");
+DEFINE_UNCORE_FORMAT_ATTR(mask0, mask0, "config2:0-31");
+DEFINE_UNCORE_FORMAT_ATTR(mask1, mask1, "config2:32-63");
+
+/*
+ * Set the FRZ bit in the box control register of a PCI-based box.
+ * Nothing is written back if the config space read fails.
+ */
+static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 val = 0;
+
+	if (pci_read_config_dword(pdev, box_ctl, &val))
+		return;
+
+	val |= SNBEP_PMON_BOX_CTL_FRZ;
+	pci_write_config_dword(pdev, box_ctl, val);
+}
+
+/*
+ * Clear the FRZ bit in the box control register of a PCI-based box.
+ * Nothing is written back if the config space read fails.
+ */
+static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+	struct pci_dev *pdev = box->pci_dev;
+	int box_ctl = uncore_pci_box_ctl(box);
+	u32 val = 0;
+
+	if (pci_read_config_dword(pdev, box_ctl, &val))
+		return;
+
+	val &= ~SNBEP_PMON_BOX_CTL_FRZ;
+	pci_write_config_dword(pdev, box_ctl, val);
+}
+
+/* Write the event's control value with the enable bit set. */
+static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(box->pci_dev, hwc->config_base,
+			       hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+/* Write the event's control value without the enable bit. */
+static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	pci_write_config_dword(box->pci_dev, hwc->config_base, hwc->config);
+}
+
+/*
+ * Read a 64-bit counter as two 32-bit config space reads: the dword at
+ * event_base fills the first half of the result in memory, the dword at
+ * event_base + 4 the second half.  Read errors are not checked; a failed
+ * read leaves that half at zero.
+ */
+static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct pci_dev *pdev = box->pci_dev;
+	u64 count = 0;
+	u32 *halves = (u32 *)&count;
+
+	pci_read_config_dword(pdev, hwc->event_base, &halves[0]);
+	pci_read_config_dword(pdev, hwc->event_base + 4, &halves[1]);
+
+	return count;
+}
+
+/* Write the initial value (SNBEP_PMON_BOX_CTL_INT) to the PCI box control register. */
+static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+	pci_write_config_dword(box->pci_dev, SNBEP_PCI_PMON_BOX_CTL,
+			       SNBEP_PMON_BOX_CTL_INT);
+}
+
+/*
+ * Set the FRZ bit in the box control MSR.  Boxes without a box control
+ * MSR (uncore_msr_box_ctl() returns 0) are left alone.
+ */
+static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 val;
+
+	if (!msr)
+		return;
+
+	rdmsrl(msr, val);
+	val |= SNBEP_PMON_BOX_CTL_FRZ;
+	wrmsrl(msr, val);
+}
+
+/*
+ * Clear the FRZ bit in the box control MSR.  Boxes without a box control
+ * MSR (uncore_msr_box_ctl() returns 0) are left alone.
+ */
+static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+	u64 val;
+
+	if (!msr)
+		return;
+
+	rdmsrl(msr, val);
+	val &= ~SNBEP_PMON_BOX_CTL_FRZ;
+	wrmsrl(msr, val);
+}
+
+/*
+ * Enable an MSR-based event.  If the event uses an extra (filter) register,
+ * that register is programmed from shared register 0 before the event's
+ * control MSR is written with the enable bit set.
+ */
+static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *filter = &hwc->extra_reg;
+
+	if (filter->idx != EXTRA_REG_NONE)
+		wrmsrl(filter->reg, uncore_shared_reg_config(box, 0));
+
+	wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+/* Write the event's control value to its control MSR without the enable bit. */
+static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
+					   struct perf_event *event)
+{
+	wrmsrl(event->hw.config_base, event->hw.config);
+}
+
+/*
+ * Write the initial value (SNBEP_PMON_BOX_CTL_INT) to the box control MSR,
+ * if the box has one.
+ */
+static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+
+	if (!msr)
+		return;
+
+	wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
+}
+
+/* Generic SNB-EP format: event, umask, edge, inv and an 8-bit threshold (NULL-terminated) */
+static struct attribute *snbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+/* Ubox format: like the generic one, but with the 5-bit threshold variant */
+static struct attribute *snbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+/* Cbox format: generic fields plus tid_en and the config1 filter fields (tid, nid, state, opc) */
+static struct attribute *snbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_nid.attr,
+ &format_attr_filter_state.attr,
+ &format_attr_filter_opc.attr,
+ NULL,
+};
+
+/* PCU format: extended event select, occupancy controls and the four config1 band filters */
+static struct attribute *snbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+/*
+ * QPI format: extended event select plus the packet match (config1) and
+ * packet mask (config2) fields consumed by snbep_qpi_hw_config().
+ */
+static struct attribute *snbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+/* Named event aliases for the IMC boxes (see snbep_uncore_imc.event_descs); zero-terminated */
+static struct uncore_event_desc snbep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ { /* end: all zeroes */ },
+};
+
+/*
+ * Named event aliases for the QPI boxes; zero-terminated.  Event values
+ * above 0xff need the extended event field (event_ext, "config:0-7,21"),
+ * which the QPI format group provides.
+ */
+static struct uncore_event_desc snbep_uncore_qpi_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
+ INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
+ INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x102,umask=0x08"),
+ INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x103,umask=0x04"),
+ { /* end: all zeroes */ },
+};
+
+/*
+ * Attribute groups, all named "format", wrapping the SNB-EP attribute arrays
+ * above.  Each uncore type points at exactly one of them via .format_group.
+ */
+
+/* generic layout (used through SNBEP_UNCORE_PCI_COMMON_INIT) */
+static struct attribute_group snbep_uncore_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_formats_attr,
+};
+
+/* Ubox */
+static struct attribute_group snbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_ubox_formats_attr,
+};
+
+/* Cbox */
+static struct attribute_group snbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_cbox_formats_attr,
+};
+
+/* PCU */
+static struct attribute_group snbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_pcu_formats_attr,
+};
+
+/* QPI */
+static struct attribute_group snbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = snbep_uncore_qpi_formats_attr,
+};
+
+/*
+ * Designated initializers shared by every MSR-based SNB-EP ops table.
+ * Expands without a trailing comma; the user supplies it.
+ */
+#define SNBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+/* Plain MSR ops with no hw_config or constraint hooks (used by the Ubox) */
+static struct intel_uncore_ops snbep_uncore_msr_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+/*
+ * Designated initializers shared by every PCI-based SNB-EP ops table.
+ * .enable_event is deliberately left out so that each table can choose its
+ * own (the QPI ops install snbep_qpi_enable_event).
+ */
+#define SNBEP_UNCORE_PCI_OPS_COMMON_INIT() \
+ .init_box = snbep_uncore_pci_init_box, \
+ .disable_box = snbep_uncore_pci_disable_box, \
+ .enable_box = snbep_uncore_pci_enable_box, \
+ .disable_event = snbep_uncore_pci_disable_event, \
+ .read_counter = snbep_uncore_pci_read_counter
+
+/* Default PCI ops: the common callbacks plus the generic enable_event */
+static struct intel_uncore_ops snbep_uncore_pci_ops = {
+	SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+	.enable_event = snbep_uncore_pci_enable_event,
+};
+
+/*
+ * Counter constraints for Cbox events.
+ * NOTE(review): the macro arguments are presumably (event code, bitmask of
+ * counters the event may use) -- confirm against UNCORE_EVENT_CONSTRAINT.
+ */
+static struct event_constraint snbep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
+ UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
+ EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+/* Counter constraints for R2PCIe events (see snbep_uncore_r2pcie.constraints) */
+static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+/* Counter constraints for R3QPI events (see snbep_uncore_r3qpi.constraints) */
+static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+/*
+ * Ubox: a single MSR-based box with two 44-bit general counters and one
+ * 48-bit fixed counter.  No .box_ctl is set here.
+ */
+static struct intel_uncore_type snbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &snbep_uncore_ubox_format_group,
+};
+
+/*
+ * Cbox events that need the box filter register.  snbep_cbox_hw_config()
+ * compares (config & config_mask) against each entry's event and ORs the idx
+ * of every match.  The idx bits select filter fields as decoded by
+ * snbep_cbox_filter_mask(): 0x1 TID, 0x2 NID, 0x4 STATE, 0x8 OPC.
+ * NOTE(review): the macro arguments are presumably (event, config mask, idx)
+ * -- confirm against SNBEP_CBO_EVENT_EXTRA_REG.
+ */
+static struct extra_reg snbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x2),
+ EVENT_EXTRA_END
+};
+
+/*
+ * Drop the references this event holds on the shared Cbox filter register.
+ * Each filter field has its own 6-bit slice in er->ref; a reference is
+ * released for every field recorded in reg1->alloc.  Fake boxes never own
+ * references, so they are skipped.
+ */
+static void snbep_cbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+	int field;
+
+	if (uncore_box_is_fake(box))
+		return;
+
+	for (field = 0; field < 5; field++) {
+		if (!(reg1->alloc & (0x1 << field)))
+			continue;
+		atomic_sub(1 << (field * 6), &er->ref);
+	}
+	reg1->alloc = 0;
+}
+
+/*
+ * Try to reserve, in shared register 0 of the box, every filter field the
+ * event needs (the bits set in reg1->idx).
+ *
+ * @cbox_filter_mask translates a field bit into the mask of filter register
+ * bits that field occupies.
+ *
+ * Returns NULL when nothing has to be reserved or all fields were obtained,
+ * and &uncore_constraint_empty when some field is already in use with a
+ * different value.
+ */
+static struct event_constraint *
+__snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event,
+ u64 (*cbox_filter_mask)(int fields))
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ int i, alloc = 0;
+ unsigned long flags;
+ u64 mask;
+
+ if (reg1->idx == EXTRA_REG_NONE)
+ return NULL;
+
+ raw_spin_lock_irqsave(&er->lock, flags);
+ for (i = 0; i < 5; i++) {
+ /* field not needed by this event */
+ if (!(reg1->idx & (0x1 << i)))
+ continue;
+ /* field already held by this event (real boxes only) */
+ if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
+ continue;
+
+ mask = cbox_filter_mask(0x1 << i);
+ /*
+ * The field can be taken when its 6-bit slice of er->ref is zero
+ * or when the value already programmed equals the one we want.
+ */
+ if (!__BITS_VALUE(atomic_read(&er->ref), i, 6) ||
+ !((reg1->config ^ er->config) & mask)) {
+ atomic_add(1 << (i * 6), &er->ref);
+ er->config &= ~mask;
+ er->config |= reg1->config & mask;
+ alloc |= (0x1 << i);
+ } else {
+ break;
+ }
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+ /* the loop stopped early: field i conflicted */
+ if (i < 5)
+ goto fail;
+
+ /* only real boxes remember what they hold; see snbep_cbox_put_constraint() */
+ if (!uncore_box_is_fake(box))
+ reg1->alloc |= alloc;
+
+ return NULL;
+fail:
+ /* give back the references taken during this call */
+ for (; i >= 0; i--) {
+ if (alloc & (0x1 << i))
+ atomic_sub(1 << (i * 6), &er->ref);
+ }
+ return &uncore_constraint_empty;
+}
+
+/*
+ * Translate a set of filter field bits (0x1 TID, 0x2 NID, 0x4 STATE,
+ * 0x8 OPC) into the combined bit mask of the SNB-EP Cbox filter register.
+ * Other bits in @fields are ignored.
+ */
+static u64 snbep_cbox_filter_mask(int fields)
+{
+	u64 mask = 0;
+
+	mask |= (fields & 0x1) ? SNBEP_CB0_MSR_PMON_BOX_FILTER_TID : 0;
+	mask |= (fields & 0x2) ? SNBEP_CB0_MSR_PMON_BOX_FILTER_NID : 0;
+	mask |= (fields & 0x4) ? SNBEP_CB0_MSR_PMON_BOX_FILTER_STATE : 0;
+	mask |= (fields & 0x8) ? SNBEP_CB0_MSR_PMON_BOX_FILTER_OPC : 0;
+
+	return mask;
+}
+
+/* get_constraint hook for SNB-EP Cboxes: the shared helper with the SNB-EP filter layout */
+static struct event_constraint *
+snbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ return __snbep_cbox_get_constraint(box, event, snbep_cbox_filter_mask);
+}
+
+/*
+ * Work out which filter fields a Cbox event needs by matching it against
+ * snbep_uncore_cbox_extra_regs.  If any are needed, record the filter MSR of
+ * this box, the relevant part of attr.config1 and the field set in the
+ * event's extra register.  Always returns 0.
+ */
+static int snbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct extra_reg *er;
+	int idx = 0;
+
+	for (er = snbep_uncore_cbox_extra_regs; er->msr; er++) {
+		if ((event->hw.config & er->config_mask) == er->event)
+			idx |= er->idx;
+	}
+
+	if (!idx)
+		return 0;
+
+	reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+		    SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+	reg1->config = event->attr.config1 & snbep_cbox_filter_mask(idx);
+	reg1->idx = idx;
+	return 0;
+}
+
+/* Cbox ops: common MSR callbacks plus filter setup and shared-filter arbitration */
+static struct intel_uncore_ops snbep_uncore_cbox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_cbox_hw_config,
+ .get_constraint = snbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+/*
+ * Cbox: up to eight MSR-based boxes, SNBEP_CBO_MSR_OFFSET apart, each with
+ * four 44-bit counters and one shared (filter) register.
+ * snbep_uncore_cpu_init() lowers num_boxes to the core count if needed.
+ */
+static struct intel_uncore_type snbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &snbep_uncore_cbox_ops,
+ .format_group = &snbep_uncore_cbox_format_group,
+};
+
+/*
+ * Compute the PCU filter value the event would have if it used band
+ * @new_idx instead of its current band: the value is shifted by 8 bits per
+ * band of difference.
+ *
+ * When @modify is true the event itself is switched over: hwc->config is
+ * adjusted by the band difference and the extra register takes the new
+ * value and index.
+ *
+ * Returns the shifted filter value.
+ */
+static u64 snbep_pcu_alter_er(struct perf_event *event, int new_idx, bool modify)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int delta = new_idx - reg1->idx;
+	u64 shifted = reg1->config;
+
+	if (delta > 0)
+		shifted <<= 8 * delta;
+	else
+		shifted >>= 8 * -delta;
+
+	if (!modify)
+		return shifted;
+
+	hwc->config += delta;
+	reg1->config = shifted;
+	reg1->idx = new_idx;
+	return shifted;
+}
+
+/*
+ * Reserve one 8-bit band of the shared PCU filter register for the event.
+ *
+ * The band the event asked for is tried first.  If it is in use with a
+ * different value, the remaining bands are tried in turn (idx + 1 modulo 4),
+ * with the filter value shifted to the candidate band.
+ *
+ * Returns NULL when nothing has to be reserved or a band was obtained, and
+ * &uncore_constraint_empty when all four bands conflict.
+ */
+static struct event_constraint *
+snbep_pcu_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+ struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+ unsigned long flags;
+ int idx = reg1->idx;
+ u64 mask, config1 = reg1->config;
+ bool ok = false;
+
+ /* no filter needed, or (real box) a band is already held */
+ if (reg1->idx == EXTRA_REG_NONE ||
+ (!uncore_box_is_fake(box) && reg1->alloc))
+ return NULL;
+again:
+ mask = 0xffULL << (idx * 8);
+ raw_spin_lock_irqsave(&er->lock, flags);
+ /* band is free (8-bit slice of er->ref is zero) or holds the same value */
+ if (!__BITS_VALUE(atomic_read(&er->ref), idx, 8) ||
+ !((config1 ^ er->config) & mask)) {
+ atomic_add(1 << (idx * 8), &er->ref);
+ er->config &= ~mask;
+ er->config |= config1 & mask;
+ ok = true;
+ }
+ raw_spin_unlock_irqrestore(&er->lock, flags);
+
+ if (!ok) {
+ idx = (idx + 1) % 4;
+ /* stop once we are back at the band we started from */
+ if (idx != reg1->idx) {
+ config1 = snbep_pcu_alter_er(event, idx, false);
+ goto again;
+ }
+ return &uncore_constraint_empty;
+ }
+
+ /* real boxes commit the band switch and remember the allocation */
+ if (!uncore_box_is_fake(box)) {
+ if (idx != reg1->idx)
+ snbep_pcu_alter_er(event, idx, true);
+ reg1->alloc = 1;
+ }
+ return NULL;
+}
+
+/*
+ * Release the PCU filter band held by the event: drop one reference from
+ * the band's 8-bit slice of er->ref and clear the allocation flag.  Does
+ * nothing for fake boxes or for events that hold no band.
+ */
+static void snbep_pcu_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
+	struct intel_uncore_extra_reg *er = &box->shared_regs[0];
+
+	if (uncore_box_is_fake(box))
+		return;
+	if (!reg1->alloc)
+		return;
+
+	atomic_sub(1 << (reg1->idx * 8), &er->ref);
+	reg1->alloc = 0;
+}
+
+/*
+ * Set up the PCU band filter for events that use it.
+ *
+ * Event selects 0xb..0xe each use one 8-bit band of the PCU box filter
+ * register; band N belongs to event select 0xb + N.  Only the byte of
+ * attr.config1 that corresponds to the event's own band is kept.
+ *
+ * The mask is built from a 64-bit constant.  With a plain int, 0xff << 24
+ * (band 3) overflows into the sign bit and, once widened for the AND with
+ * the 64-bit config1, would let bits 32-63 of config1 through.
+ *
+ * Always returns 0.
+ */
+static int snbep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+	int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+	if (ev_sel >= 0xb && ev_sel <= 0xe) {
+		reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
+		reg1->idx = ev_sel - 0xb;
+		reg1->config = event->attr.config1 & (0xffULL << (reg1->idx * 8));
+	}
+	return 0;
+}
+
+/* PCU ops: common MSR callbacks plus band filter setup and arbitration */
+static struct intel_uncore_ops snbep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+/* PCU: a single MSR-based box with four 48-bit counters and one shared (filter) register */
+static struct intel_uncore_type snbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+/* NULL-terminated list of the MSR-based SNB-EP uncore types; installed by snbep_uncore_cpu_init() */
+static struct intel_uncore_type *snbep_msr_uncores[] = {
+ &snbep_uncore_ubox,
+ &snbep_uncore_cbox,
+ &snbep_uncore_pcu,
+ NULL,
+};
+
+/*
+ * Register the MSR-based SNB-EP uncore types.  The number of Cboxes is
+ * capped at the number of cores reported for the boot CPU.
+ */
+void snbep_uncore_cpu_init(void)
+{
+	if (boot_cpu_data.x86_max_cores < snbep_uncore_cbox.num_boxes)
+		snbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+	uncore_msr_uncores = snbep_msr_uncores;
+}
+
+/*
+ * Indices of the QPI filter devices among the extra PCI devices.
+ * snbep_qpi_enable_event() adds the QPI box index to PORT0_FILTER, so the
+ * entries must stay consecutive and in port order.
+ */
+enum {
+ SNBEP_PCI_QPI_PORT0_FILTER,
+ SNBEP_PCI_QPI_PORT1_FILTER,
+};
+
+/*
+ * For event select 0x38, stash the packet match value (attr.config1) and
+ * the packet mask value (attr.config2) together with the registers they go
+ * to; snbep_qpi_enable_event() writes them out.  Other events are left
+ * untouched.  Always returns 0.
+ */
+static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+
+	if ((hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK) != 0x38)
+		return 0;
+
+	match->idx = 0;
+	match->reg = SNBEP_Q_Py_PCI_PMON_PKT_MATCH0;
+	match->config = event->attr.config1;
+	mask->reg = SNBEP_Q_Py_PCI_PMON_PKT_MASK0;
+	mask->config = event->attr.config2;
+	return 0;
+}
+
+/*
+ * Enable a QPI event.  If the event carries packet match/mask values, they
+ * are first written to the filter device of this QPI port, each 64-bit
+ * value as two dwords (low half at reg, high half at reg + 4).  The write
+ * is skipped when no filter device is known for the port.  Finally the
+ * event's control register is written with the enable bit set.
+ */
+static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct hw_perf_event_extra *match = &hwc->extra_reg;
+	struct hw_perf_event_extra *mask = &hwc->branch_reg;
+	struct pci_dev *filter_pdev = NULL;
+
+	if (match->idx != EXTRA_REG_NONE) {
+		int port = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
+
+		filter_pdev = uncore_extra_pci_dev[box->phys_id][port];
+	}
+
+	if (filter_pdev) {
+		u32 match_lo = (u32)match->config;
+		u32 match_hi = (u32)(match->config >> 32);
+		u32 mask_lo = (u32)mask->config;
+		u32 mask_hi = (u32)(mask->config >> 32);
+
+		pci_write_config_dword(filter_pdev, match->reg, match_lo);
+		pci_write_config_dword(filter_pdev, match->reg + 4, match_hi);
+		pci_write_config_dword(filter_pdev, mask->reg, mask_lo);
+		pci_write_config_dword(filter_pdev, mask->reg + 4, mask_hi);
+	}
+
+	pci_write_config_dword(box->pci_dev, hwc->config_base,
+			       hwc->config | SNBEP_PMON_CTL_EN);
+}
+
+/* QPI ops: common PCI callbacks, a QPI-specific enable_event and the generic constraint helpers */
+static struct intel_uncore_ops snbep_uncore_qpi_ops = {
+ SNBEP_UNCORE_PCI_OPS_COMMON_INIT(),
+ .enable_event = snbep_qpi_enable_event,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+/*
+ * Designated initializers shared by the PCI-based SNB-EP uncore types that
+ * use the default register layout, ops and format group.  Expands without a
+ * trailing comma.
+ */
+#define SNBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &snbep_uncore_pci_ops, \
+ .format_group = &snbep_uncore_format_group
+
+/* Home Agent: one PCI-based box with four 48-bit counters */
+static struct intel_uncore_type snbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/*
+ * Memory controller channels: four PCI-based boxes, each with four 48-bit
+ * counters and a 48-bit fixed counter, plus named event aliases.
+ */
+static struct intel_uncore_type snbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/*
+ * QPI ports: two PCI-based boxes with four 48-bit counters each.  Spelled
+ * out instead of using SNBEP_UNCORE_PCI_COMMON_INIT() because the event
+ * mask, ops and format group differ from the defaults.
+ */
+static struct intel_uncore_type snbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .event_descs = snbep_uncore_qpi_events,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+
+/* R2PCIe: one PCI-based box with four 44-bit counters and per-event counter constraints */
+static struct intel_uncore_type snbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/* R3QPI: two PCI-based boxes with three 44-bit counters each and per-event counter constraints */
+static struct intel_uncore_type snbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+/*
+ * Indices into snbep_pci_uncores[]; also used as the type part of
+ * driver_data in snbep_uncore_pci_ids[].
+ */
+enum {
+ SNBEP_PCI_UNCORE_HA,
+ SNBEP_PCI_UNCORE_IMC,
+ SNBEP_PCI_UNCORE_QPI,
+ SNBEP_PCI_UNCORE_R2PCIE,
+ SNBEP_PCI_UNCORE_R3QPI,
+};
+
+/* NULL-terminated table of the PCI-based SNB-EP uncore types, indexed by SNBEP_PCI_UNCORE_* */
+static struct intel_uncore_type *snbep_pci_uncores[] = {
+ [SNBEP_PCI_UNCORE_HA] = &snbep_uncore_ha,
+ [SNBEP_PCI_UNCORE_IMC] = &snbep_uncore_imc,
+ [SNBEP_PCI_UNCORE_QPI] = &snbep_uncore_qpi,
+ [SNBEP_PCI_UNCORE_R2PCIE] = &snbep_uncore_r2pcie,
+ [SNBEP_PCI_UNCORE_R3QPI] = &snbep_uncore_r3qpi,
+ NULL,
+};
+
+/*
+ * PCI devices handled by the SNB-EP uncore driver; zero-terminated.
+ * NOTE(review): driver_data presumably packs (uncore type index, box index)
+ * via UNCORE_PCI_DEV_DATA, and entries of type UNCORE_EXTRA_PCI_DEV are
+ * presumably the devices later found through uncore_extra_pci_dev[] --
+ * confirm against the macro and the probe code.
+ */
+static const struct pci_device_id snbep_uncore_pci_ids[] = {
+ { /* Home Agent */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* MC Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC Channel 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* QPI Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3c96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+/* PCI driver skeleton: only name and id table are set here; installed by snbep_uncore_pci_init() */
+static struct pci_driver snbep_uncore_pci_driver = {
+ .name = "snbep_uncore",
+ .id_table = snbep_uncore_pci_ids,
+};
+
+/*
+ * build pci bus to socket mapping
+ *
+ * Walk every UBOX device with PCI device id @devid, read its local Node ID
+ * (config offset 0x40) and the Node ID mapping register (offset 0x54), and
+ * record in uncore_pcibus_to_physid[] which of the eight 3-bit mapping slots
+ * holds that Node ID.  Buses without a UBOX then take over the mapping of
+ * the next higher-numbered bus that has one.
+ *
+ * Returns 0 on success, or the errno translation of the PCI error if a
+ * config space read failed.
+ */
+static int snbep_pci2phy_map_init(int devid)
+{
+	struct pci_dev *ubox_dev = NULL;
+	int i, bus, nodeid;
+	int err = 0;
+	u32 config = 0;
+
+	while (1) {
+		/* find the UBOX device */
+		ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, devid, ubox_dev);
+		if (!ubox_dev)
+			break;
+		bus = ubox_dev->bus->number;
+		/* get the Node ID of the local register */
+		err = pci_read_config_dword(ubox_dev, 0x40, &config);
+		if (err)
+			break;
+		/*
+		 * The mapping slots below are three bits wide, so only the
+		 * low three bits of the register can be compared; any other
+		 * bit set would make every comparison fail.
+		 */
+		nodeid = config & 0x7;
+		/* get the Node ID mapping */
+		err = pci_read_config_dword(ubox_dev, 0x54, &config);
+		if (err)
+			break;
+		/*
+		 * every three bits in the Node ID mapping register maps
+		 * to a particular node.
+		 */
+		for (i = 0; i < 8; i++) {
+			if (nodeid == ((config >> (3 * i)) & 0x7)) {
+				uncore_pcibus_to_physid[bus] = i;
+				break;
+			}
+		}
+	}
+
+	if (!err) {
+		/*
+		 * For PCI bus with no UBOX device, find the next bus
+		 * that has UBOX device and use its mapping.
+		 */
+		i = -1;
+		for (bus = 255; bus >= 0; bus--) {
+			if (uncore_pcibus_to_physid[bus] >= 0)
+				i = uncore_pcibus_to_physid[bus];
+			else
+				uncore_pcibus_to_physid[bus] = i;
+		}
+	}
+
+	/* drop the reference still held when the walk ended on a read error */
+	if (ubox_dev)
+		pci_dev_put(ubox_dev);
+
+	return err ? pcibios_err_to_errno(err) : 0;
+}
+
+/*
+ * Build the bus-to-socket map from the SNB-EP UBOX devices (PCI device id
+ * 0x3ce0) and, if that worked, install the SNB-EP PCI uncore types and
+ * driver.  Returns 0 on success or the error from the map setup.
+ */
+int snbep_uncore_pci_init(void)
+{
+	int ret;
+
+	ret = snbep_pci2phy_map_init(0x3ce0);
+	if (ret)
+		return ret;
+
+	uncore_pci_uncores = snbep_pci_uncores;
+	uncore_pci_driver = &snbep_uncore_pci_driver;
+	return 0;
+}
+/* end of Sandy Bridge-EP uncore support */
+
+/* IvyTown uncore support */
+/*
+ * Write the IvyTown initial value (IVBEP_PMON_BOX_CTL_INT) to the box
+ * control MSR, if the box has one.
+ */
+static void ivbep_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+	unsigned msr = uncore_msr_box_ctl(box);
+
+	if (!msr)
+		return;
+
+	wrmsrl(msr, IVBEP_PMON_BOX_CTL_INT);
+}
+
+/* Write the IvyTown initial value (IVBEP_PMON_BOX_CTL_INT) to the PCI box control register. */
+static void ivbep_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+	pci_write_config_dword(box->pci_dev, SNBEP_PCI_PMON_BOX_CTL,
+			       IVBEP_PMON_BOX_CTL_INT);
+}
+
+/*
+ * Designated initializers shared by the MSR-based IvyTown ops tables.  Only
+ * init_box is IvyTown specific; the rest reuses the SNB-EP callbacks.
+ */
+#define IVBEP_UNCORE_MSR_OPS_COMMON_INIT() \
+ .init_box = ivbep_uncore_msr_init_box, \
+ .disable_box = snbep_uncore_msr_disable_box, \
+ .enable_box = snbep_uncore_msr_enable_box, \
+ .disable_event = snbep_uncore_msr_disable_event, \
+ .enable_event = snbep_uncore_msr_enable_event, \
+ .read_counter = uncore_msr_read_counter
+
+/* Plain IvyTown MSR ops with no hw_config or constraint hooks (used by the Ubox) */
+static struct intel_uncore_ops ivbep_uncore_msr_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+};
+
+/* Default IvyTown PCI ops: SNB-EP callbacks with an IvyTown-specific init_box */
+static struct intel_uncore_ops ivbep_uncore_pci_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_uncore_pci_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+};
+
+/*
+ * Designated initializers shared by the PCI-based IvyTown uncore types.
+ * Register offsets are the SNB-EP ones; event mask, ops and format group
+ * are IvyTown specific.  Expands without a trailing comma.
+ */
+#define IVBEP_UNCORE_PCI_COMMON_INIT() \
+ .perf_ctr = SNBEP_PCI_PMON_CTR0, \
+ .event_ctl = SNBEP_PCI_PMON_CTL0, \
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK, \
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
+ .ops = &ivbep_uncore_pci_ops, \
+ .format_group = &ivbep_uncore_format_group
+
+/* Generic IvyTown format: event, umask, edge, inv and an 8-bit threshold (NULL-terminated) */
+static struct attribute *ivbep_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+/* IvyTown Ubox format: like the generic one, but with the 5-bit threshold variant */
+static struct attribute *ivbep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ NULL,
+};
+
+/*
+ * IvyTown Cbox format.  Unlike the SNB-EP Cbox list there is no "inv"
+ * entry, and the filter fields use the wider 64-bit layout (link, nid2,
+ * opc2, nc, c6, isoc).
+ */
+static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid.attr,
+ &format_attr_filter_link.attr,
+ &format_attr_filter_state2.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+/* IvyTown PCU format: same as the SNB-EP PCU list minus "inv" */
+static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_occ_sel.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_occ_invert.attr,
+ &format_attr_occ_edge.attr,
+ &format_attr_filter_band0.attr,
+ &format_attr_filter_band1.attr,
+ &format_attr_filter_band2.attr,
+ &format_attr_filter_band3.attr,
+ NULL,
+};
+
+/* IvyTown QPI format: same as the SNB-EP QPI list minus "inv" */
+static struct attribute *ivbep_uncore_qpi_formats_attr[] = {
+ &format_attr_event_ext.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_match_rds.attr,
+ &format_attr_match_rnid30.attr,
+ &format_attr_match_rnid4.attr,
+ &format_attr_match_dnid.attr,
+ &format_attr_match_mc.attr,
+ &format_attr_match_opc.attr,
+ &format_attr_match_vnw.attr,
+ &format_attr_match0.attr,
+ &format_attr_match1.attr,
+ &format_attr_mask_rds.attr,
+ &format_attr_mask_rnid30.attr,
+ &format_attr_mask_rnid4.attr,
+ &format_attr_mask_dnid.attr,
+ &format_attr_mask_mc.attr,
+ &format_attr_mask_opc.attr,
+ &format_attr_mask_vnw.attr,
+ &format_attr_mask0.attr,
+ &format_attr_mask1.attr,
+ NULL,
+};
+
+/*
+ * Attribute groups, all named "format", wrapping the IvyTown attribute
+ * arrays above.
+ */
+
+/* generic layout (used through IVBEP_UNCORE_PCI_COMMON_INIT) */
+static struct attribute_group ivbep_uncore_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_formats_attr,
+};
+
+/* Ubox */
+static struct attribute_group ivbep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_ubox_formats_attr,
+};
+
+/* Cbox */
+static struct attribute_group ivbep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_cbox_formats_attr,
+};
+
+/* PCU */
+static struct attribute_group ivbep_uncore_pcu_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_pcu_formats_attr,
+};
+
+/* QPI */
+static struct attribute_group ivbep_uncore_qpi_format_group = {
+ .name = "format",
+ .attrs = ivbep_uncore_qpi_formats_attr,
+};
+
+static struct intel_uncore_type ivbep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_U_MSR_PMON_CTL0,
+ .event_mask = IVBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .ops = &ivbep_uncore_msr_ops,
+ .format_group = &ivbep_uncore_ubox_format_group,
+};
+
+static struct extra_reg ivbep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4334, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4534, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4934, 0xffff, 0xc),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ EVENT_EXTRA_END
+};
+
+ /*
+  * Translate a bitmask of C-box filter field selectors into the matching
+  * bits of the IvyTown C-box filter value.
+  *
+  * @fields: selector bits: 0x1 = TID, 0x2 = LINK, 0x4 = STATE, 0x8 = NID,
+  *          0x10 = OPC together with NC, C6 and ISOC.
+  *
+  * Returns the OR of the IVBEP_CB0_MSR_PMON_BOX_FILTER_* masks of every
+  * selector that is set, or 0 when none is.
+  */
+ static u64 ivbep_cbox_filter_mask(int fields)
+ {
+ 	u64 filter = 0;
+
+ 	filter |= (fields & 0x1) ? IVBEP_CB0_MSR_PMON_BOX_FILTER_TID : 0;
+ 	filter |= (fields & 0x2) ? IVBEP_CB0_MSR_PMON_BOX_FILTER_LINK : 0;
+ 	filter |= (fields & 0x4) ? IVBEP_CB0_MSR_PMON_BOX_FILTER_STATE : 0;
+ 	filter |= (fields & 0x8) ? IVBEP_CB0_MSR_PMON_BOX_FILTER_NID : 0;
+
+ 	/* selector 0x10 always enables the four opcode-related fields together */
+ 	if (fields & 0x10)
+ 		filter |= IVBEP_CB0_MSR_PMON_BOX_FILTER_OPC |
+ 			  IVBEP_CB0_MSR_PMON_BOX_FILTER_NC |
+ 			  IVBEP_CB0_MSR_PMON_BOX_FILTER_C6 |
+ 			  IVBEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+
+ 	return filter;
+ }
+
+ /*
+  * Constraint lookup for IvyTown C-box events: defers to the shared
+  * __snbep_cbox_get_constraint() helper, handing it ivbep_cbox_filter_mask
+  * as the filter-mask callback.
+  */
+ static struct event_constraint *
+ ivbep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ return __snbep_cbox_get_constraint(box, event, ivbep_cbox_filter_mask);
+ }
+
+ /*
+  * Work out which C-box filter fields an event needs and record them in
+  * the event's extra register.
+  *
+  * Walks ivbep_uncore_cbox_extra_regs (stopping at the first entry whose
+  * msr field is zero) and ORs together the idx of every entry whose event
+  * code equals the event's config masked with that entry's config_mask.
+  * If any field was selected, the extra register is pointed at this box's
+  * filter MSR and loaded with the permitted bits of attr.config1.
+  *
+  * Always returns 0.
+  */
+ static int ivbep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	struct hw_perf_event_extra *filter_reg = &event->hw.extra_reg;
+ 	struct extra_reg *entry = ivbep_uncore_cbox_extra_regs;
+ 	int fields = 0;
+
+ 	while (entry->msr) {
+ 		if ((event->hw.config & entry->config_mask) == entry->event)
+ 			fields |= entry->idx;
+ 		entry++;
+ 	}
+
+ 	/* no table entry matched: the event uses no filter */
+ 	if (!fields)
+ 		return 0;
+
+ 	filter_reg->idx = fields;
+ 	filter_reg->config = event->attr.config1 & ivbep_cbox_filter_mask(fields);
+ 	filter_reg->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
+ 			  SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ 	return 0;
+ }
+
+ /*
+  * Enable one IvyTown C-box event.
+  *
+  * If the event has an extra register assigned (reg1->idx is not
+  * EXTRA_REG_NONE), the 64-bit value returned by
+  * uncore_shared_reg_config(box, 0) is written out in two 32-bit halves
+  * first; afterwards the event's control MSR is written with
+  * SNBEP_PMON_CTL_EN set.
+  */
+ static void ivbep_cbox_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ /* low 32 bits go to the MSR at reg1->reg ... */
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ /* ... high 32 bits go to the MSR six addresses above it */
+ wrmsrl(reg1->reg + 6, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+ }
+
+static struct intel_uncore_ops ivbep_uncore_cbox_ops = {
+ .init_box = ivbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = ivbep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = ivbep_cbox_hw_config,
+ .get_constraint = ivbep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+ /*
+  * IvyTown C-box uncore type, accessed through MSRs.
+  * .num_boxes is only an upper bound: ivbep_uncore_cpu_init() lowers it to
+  * boot_cpu_data.x86_max_cores when that is smaller.
+  */
+ static struct intel_uncore_type ivbep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 15,
+ .perf_ctr_bits = 44,
+ .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
+ .event_mask = IVBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = SNBEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = snbep_uncore_cbox_constraints,
+ .ops = &ivbep_uncore_cbox_ops,
+ .format_group = &ivbep_uncore_cbox_format_group,
+ };
+
+static struct intel_uncore_ops ivbep_uncore_pcu_ops = {
+ IVBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = snbep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
+ .event_mask = IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_pcu_ops,
+ .format_group = &ivbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *ivbep_msr_uncores[] = {
+ &ivbep_uncore_ubox,
+ &ivbep_uncore_cbox,
+ &ivbep_uncore_pcu,
+ NULL,
+};
+
+ /*
+  * Select the IvyTown MSR-based uncore types.
+  * The C-box count is first capped at boot_cpu_data.x86_max_cores, then
+  * uncore_msr_uncores is pointed at the IvyTown type list.
+  */
+ void ivbep_uncore_cpu_init(void)
+ {
+ if (ivbep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ ivbep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = ivbep_msr_uncores;
+ }
+
+static struct intel_uncore_type ivbep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 4,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 4,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = snbep_uncore_imc_events,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+ /*
+  * Registers in IRP boxes are not properly aligned, so the PCI config-space
+  * offsets of the per-counter control (ctls) and counter (ctrs) registers
+  * are listed explicitly and indexed by the event's hw.idx.
+  */
+ static unsigned ivbep_uncore_irp_ctls[] = {0xd8, 0xdc, 0xe0, 0xe4};
+ static unsigned ivbep_uncore_irp_ctrs[] = {0xa0, 0xb0, 0xb8, 0xc0};
+
+ /*
+  * Start an IRP counter: write the event's config with SNBEP_PMON_CTL_EN
+  * set to the control register selected by the event's hw.idx.
+  */
+ static void ivbep_uncore_irp_enable_event(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	unsigned ctl_offset = ivbep_uncore_irp_ctls[event->hw.idx];
+
+ 	pci_write_config_dword(box->pci_dev, ctl_offset,
+ 			       event->hw.config | SNBEP_PMON_CTL_EN);
+ }
+
+ /*
+  * Stop an IRP counter: write the event's config as-is (without OR-ing in
+  * SNBEP_PMON_CTL_EN) to the control register selected by hw.idx.
+  */
+ static void ivbep_uncore_irp_disable_event(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	unsigned ctl_offset = ivbep_uncore_irp_ctls[event->hw.idx];
+
+ 	pci_write_config_dword(box->pci_dev, ctl_offset, event->hw.config);
+ }
+
+ /*
+  * Read an IRP counter as two 32-bit PCI config-space reads and return the
+  * assembled 64-bit value. count is zero-initialised before the reads.
+  */
+ static u64 ivbep_uncore_irp_read_counter(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
+ /* dword at the counter offset is stored into the first 4 bytes of count */
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx], (u32 *)&count);
+ /* dword at offset + 4 is stored into the following 4 bytes of count */
+ pci_read_config_dword(pdev, ivbep_uncore_irp_ctrs[hwc->idx] + 4, (u32 *)&count + 1);
+
+ return count;
+ }
+
+static struct intel_uncore_ops ivbep_uncore_irp_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type ivbep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = IVBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &ivbep_uncore_irp_ops,
+ .format_group = &ivbep_uncore_format_group,
+};
+
+static struct intel_uncore_ops ivbep_uncore_qpi_ops = {
+ .init_box = ivbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = snbep_uncore_pci_disable_event,
+ .enable_event = snbep_qpi_enable_event,
+ .read_counter = snbep_uncore_pci_read_counter,
+ .hw_config = snbep_qpi_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type ivbep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = IVBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &ivbep_uncore_qpi_ops,
+ .format_group = &ivbep_uncore_qpi_format_group,
+};
+
+static struct intel_uncore_type ivbep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r2pcie_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type ivbep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .constraints = snbep_uncore_r3qpi_constraints,
+ IVBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+ /*
+  * Indices of the IvyTown PCI uncore types. They are used as the array
+  * designators in ivbep_pci_uncores[] and as the type argument of
+  * UNCORE_PCI_DEV_DATA() in ivbep_uncore_pci_ids[].
+  */
+ enum {
+ IVBEP_PCI_UNCORE_HA,
+ IVBEP_PCI_UNCORE_IMC,
+ IVBEP_PCI_UNCORE_IRP,
+ IVBEP_PCI_UNCORE_QPI,
+ IVBEP_PCI_UNCORE_R2PCIE,
+ IVBEP_PCI_UNCORE_R3QPI,
+ };
+
+ /*
+  * NULL-terminated list of the IvyTown PCI uncore types, indexed by the
+  * IVBEP_PCI_UNCORE_* enum; installed by ivbep_uncore_pci_init().
+  */
+ static struct intel_uncore_type *ivbep_pci_uncores[] = {
+ [IVBEP_PCI_UNCORE_HA] = &ivbep_uncore_ha,
+ [IVBEP_PCI_UNCORE_IMC] = &ivbep_uncore_imc,
+ [IVBEP_PCI_UNCORE_IRP] = &ivbep_uncore_irp,
+ [IVBEP_PCI_UNCORE_QPI] = &ivbep_uncore_qpi,
+ [IVBEP_PCI_UNCORE_R2PCIE] = &ivbep_uncore_r2pcie,
+ [IVBEP_PCI_UNCORE_R3QPI] = &ivbep_uncore_r3qpi,
+ NULL,
+ };
+
+static const struct pci_device_id ivbep_uncore_pci_ids[] = {
+ { /* Home Agent 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe30),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 0),
+ },
+ { /* Home Agent 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe38),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_HA, 1),
+ },
+ { /* MC0 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 0),
+ },
+ { /* MC0 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 1),
+ },
+ { /* MC0 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 2),
+ },
+ { /* MC0 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xeb1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 3),
+ },
+ { /* MC1 Channel 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef4),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 4),
+ },
+ { /* MC1 Channel 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef5),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 5),
+ },
+ { /* MC1 Channel 3 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef0),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 6),
+ },
+ { /* MC1 Channel 4 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xef1),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IMC, 7),
+ },
+ { /* IRP */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe39),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_IRP, 0),
+ },
+ { /* QPI0 Port 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe32),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 0),
+ },
+ { /* QPI0 Port 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe33),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 1),
+ },
+ { /* QPI1 Port 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3a),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_QPI, 2),
+ },
+ { /* R2PCIe */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe34),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R2PCIE, 0),
+ },
+ { /* R3QPI0 Link 0 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe36),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 0),
+ },
+ { /* R3QPI0 Link 1 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe37),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 1),
+ },
+ { /* R3QPI1 Link 2 */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe3e),
+ .driver_data = UNCORE_PCI_DEV_DATA(IVBEP_PCI_UNCORE_R3QPI, 2),
+ },
+ { /* QPI Port 0 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe86),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT0_FILTER),
+ },
+ { /* QPI Port 1 filter */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xe96),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ SNBEP_PCI_QPI_PORT1_FILTER),
+ },
+ { /* end: all zeroes */ }
+};
+
+static struct pci_driver ivbep_uncore_pci_driver = {
+ .name = "ivbep_uncore",
+ .id_table = ivbep_uncore_pci_ids,
+};
+
+ /*
+  * Set up PCI-based IvyTown uncore support.
+  *
+  * Calls snbep_pci2phy_map_init(0x0e1e) and, only if that returns 0,
+  * installs the IvyTown PCI uncore type list and PCI driver.
+  *
+  * Returns 0 on success, otherwise the non-zero value returned by
+  * snbep_pci2phy_map_init().
+  */
+ int ivbep_uncore_pci_init(void)
+ {
+ 	int err;
+
+ 	err = snbep_pci2phy_map_init(0x0e1e);
+ 	if (!err) {
+ 		uncore_pci_uncores = ivbep_pci_uncores;
+ 		uncore_pci_driver = &ivbep_uncore_pci_driver;
+ 	}
+ 	return err;
+ }
+/* end of IvyTown uncore support */
+
+/* Haswell-EP uncore support */
+static struct attribute *hswep_uncore_ubox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh5.attr,
+ &format_attr_filter_tid2.attr,
+ &format_attr_filter_cid.attr,
+ NULL,
+};
+
+static struct attribute_group hswep_uncore_ubox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_ubox_formats_attr,
+};
+
+ /*
+  * Configure the U-box filter for an event: every event gets extra
+  * register index 0, pointing at HSWEP_U_MSR_PMON_FILTER and loaded with
+  * the bits of attr.config1 allowed by HSWEP_U_MSR_PMON_BOX_FILTER_MASK.
+  *
+  * Always returns 0.
+  */
+ static int hswep_ubox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	struct hw_perf_event_extra *filter_reg = &event->hw.extra_reg;
+
+ 	filter_reg->idx = 0;
+ 	filter_reg->reg = HSWEP_U_MSR_PMON_FILTER;
+ 	filter_reg->config = event->attr.config1 & HSWEP_U_MSR_PMON_BOX_FILTER_MASK;
+ 	return 0;
+ }
+
+static struct intel_uncore_ops hswep_uncore_ubox_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_ubox_hw_config,
+ .get_constraint = uncore_get_constraint,
+ .put_constraint = uncore_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_ubox = {
+ .name = "ubox",
+ .num_counters = 2,
+ .num_boxes = 1,
+ .perf_ctr_bits = 44,
+ .fixed_ctr_bits = 48,
+ .perf_ctr = HSWEP_U_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_U_MSR_PMON_CTL0,
+ .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+ .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+ .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_ubox_ops,
+ .format_group = &hswep_uncore_ubox_format_group,
+};
+
+static struct attribute *hswep_uncore_cbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_thresh8.attr,
+ &format_attr_filter_tid3.attr,
+ &format_attr_filter_link2.attr,
+ &format_attr_filter_state3.attr,
+ &format_attr_filter_nid2.attr,
+ &format_attr_filter_opc2.attr,
+ &format_attr_filter_nc.attr,
+ &format_attr_filter_c6.attr,
+ &format_attr_filter_isoc.attr,
+ NULL,
+};
+
+static struct attribute_group hswep_uncore_cbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_cbox_formats_attr,
+};
+
+static struct event_constraint hswep_uncore_cbox_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x3e, 0x1),
+ EVENT_CONSTRAINT_END
+};
+
+static struct extra_reg hswep_uncore_cbox_extra_regs[] = {
+ SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
+ SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x4),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4037, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4028, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4032, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4029, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4033, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x402A, 0x40ff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x12),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8135, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x8),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8336, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x8136, 0xffff, 0x10),
+ SNBEP_CBO_EVENT_EXTRA_REG(0x5036, 0xffff, 0x8),
+ EVENT_EXTRA_END
+};
+
+ /*
+  * Translate a bitmask of C-box filter field selectors into the matching
+  * bits of the Haswell-EP C-box filter value.
+  *
+  * @fields: selector bits: 0x1 = TID, 0x2 = LINK, 0x4 = STATE, 0x8 = NID,
+  *          0x10 = OPC together with NC, C6 and ISOC.
+  *
+  * Returns the OR of the HSWEP_CB0_MSR_PMON_BOX_FILTER_* masks of every
+  * selector that is set, or 0 when none is.
+  */
+ static u64 hswep_cbox_filter_mask(int fields)
+ {
+ 	u64 filter = 0;
+
+ 	filter |= (fields & 0x1) ? HSWEP_CB0_MSR_PMON_BOX_FILTER_TID : 0;
+ 	filter |= (fields & 0x2) ? HSWEP_CB0_MSR_PMON_BOX_FILTER_LINK : 0;
+ 	filter |= (fields & 0x4) ? HSWEP_CB0_MSR_PMON_BOX_FILTER_STATE : 0;
+ 	filter |= (fields & 0x8) ? HSWEP_CB0_MSR_PMON_BOX_FILTER_NID : 0;
+
+ 	/* selector 0x10 always enables the four opcode-related fields together */
+ 	if (fields & 0x10)
+ 		filter |= HSWEP_CB0_MSR_PMON_BOX_FILTER_OPC |
+ 			  HSWEP_CB0_MSR_PMON_BOX_FILTER_NC |
+ 			  HSWEP_CB0_MSR_PMON_BOX_FILTER_C6 |
+ 			  HSWEP_CB0_MSR_PMON_BOX_FILTER_ISOC;
+
+ 	return filter;
+ }
+
+ /*
+  * Constraint lookup for Haswell-EP C-box events: defers to the shared
+  * __snbep_cbox_get_constraint() helper, handing it hswep_cbox_filter_mask
+  * as the filter-mask callback.
+  */
+ static struct event_constraint *
+ hswep_cbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ return __snbep_cbox_get_constraint(box, event, hswep_cbox_filter_mask);
+ }
+
+ /*
+  * Work out which C-box filter fields an event needs and record them in
+  * the event's extra register.
+  *
+  * Walks hswep_uncore_cbox_extra_regs (stopping at the first entry whose
+  * msr field is zero) and ORs together the idx of every entry whose event
+  * code equals the event's config masked with that entry's config_mask.
+  * If any field was selected, the extra register is pointed at this box's
+  * FILTER0 MSR and loaded with the permitted bits of attr.config1.
+  *
+  * Always returns 0.
+  */
+ static int hswep_cbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	struct hw_perf_event_extra *filter_reg = &event->hw.extra_reg;
+ 	struct extra_reg *entry = hswep_uncore_cbox_extra_regs;
+ 	int fields = 0;
+
+ 	while (entry->msr) {
+ 		if ((event->hw.config & entry->config_mask) == entry->event)
+ 			fields |= entry->idx;
+ 		entry++;
+ 	}
+
+ 	/* no table entry matched: the event uses no filter */
+ 	if (!fields)
+ 		return 0;
+
+ 	filter_reg->idx = fields;
+ 	filter_reg->config = event->attr.config1 & hswep_cbox_filter_mask(fields);
+ 	filter_reg->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 +
+ 			  HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
+ 	return 0;
+ }
+
+ /*
+  * Enable one Haswell-EP C-box event.
+  *
+  * If the event has an extra register assigned (reg1->idx is not
+  * EXTRA_REG_NONE), the 64-bit value returned by
+  * uncore_shared_reg_config(box, 0) is written out in two 32-bit halves
+  * first; afterwards the event's control MSR is written with
+  * SNBEP_PMON_CTL_EN set.
+  */
+ static void hswep_cbox_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+ {
+ struct hw_perf_event *hwc = &event->hw;
+ struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+
+ if (reg1->idx != EXTRA_REG_NONE) {
+ u64 filter = uncore_shared_reg_config(box, 0);
+ /* low 32 bits go to the MSR at reg1->reg ... */
+ wrmsrl(reg1->reg, filter & 0xffffffff);
+ /* ... high 32 bits go to the very next MSR (reg1->reg + 1) */
+ wrmsrl(reg1->reg + 1, filter >> 32);
+ }
+
+ wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
+ }
+
+static struct intel_uncore_ops hswep_uncore_cbox_ops = {
+ .init_box = snbep_uncore_msr_init_box,
+ .disable_box = snbep_uncore_msr_disable_box,
+ .enable_box = snbep_uncore_msr_enable_box,
+ .disable_event = snbep_uncore_msr_disable_event,
+ .enable_event = hswep_cbox_enable_event,
+ .read_counter = uncore_msr_read_counter,
+ .hw_config = hswep_cbox_hw_config,
+ .get_constraint = hswep_cbox_get_constraint,
+ .put_constraint = snbep_cbox_put_constraint,
+};
+
+ /*
+  * Haswell-EP C-box uncore type, accessed through MSRs.
+  * .num_boxes is only an upper bound: hswep_uncore_cpu_init() lowers it to
+  * boot_cpu_data.x86_max_cores when that is smaller.
+  */
+ static struct intel_uncore_type hswep_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 4,
+ .num_boxes = 18,
+ .perf_ctr_bits = 44,
+ .event_ctl = HSWEP_C0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_C0_MSR_PMON_CTR0,
+ .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_CBO_MSR_OFFSET,
+ .num_shared_regs = 1,
+ .constraints = hswep_uncore_cbox_constraints,
+ .ops = &hswep_uncore_cbox_ops,
+ .format_group = &hswep_uncore_cbox_format_group,
+ };
+
+static struct attribute *hswep_uncore_sbox_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_tid_en.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh8.attr,
+ NULL,
+};
+
+static struct attribute_group hswep_uncore_sbox_format_group = {
+ .name = "format",
+ .attrs = hswep_uncore_sbox_formats_attr,
+};
+
+static struct intel_uncore_type hswep_uncore_sbox = {
+ .name = "sbox",
+ .num_counters = 4,
+ .num_boxes = 4,
+ .perf_ctr_bits = 44,
+ .event_ctl = HSWEP_S0_MSR_PMON_CTL0,
+ .perf_ctr = HSWEP_S0_MSR_PMON_CTR0,
+ .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_S0_MSR_PMON_BOX_CTL,
+ .msr_offset = HSWEP_SBOX_MSR_OFFSET,
+ .ops = &snbep_uncore_msr_ops,
+ .format_group = &hswep_uncore_sbox_format_group,
+};
+
+ /*
+  * Configure the PCU band filter for an event.
+  *
+  * Event selects 0xb..0xe use filter bands 0..3 respectively. For those,
+  * the extra register is pointed at HSWEP_PCU_MSR_PMON_BOX_FILTER, its idx
+  * is set to the band number and its config to that band's byte of
+  * attr.config1. Other event selects leave the extra register untouched.
+  *
+  * Always returns 0.
+  */
+ static int hswep_pcu_hw_config(struct intel_uncore_box *box, struct perf_event *event)
+ {
+ 	struct hw_perf_event *hwc = &event->hw;
+ 	struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
+ 	int ev_sel = hwc->config & SNBEP_PMON_CTL_EV_SEL_MASK;
+
+ 	if (ev_sel >= 0xb && ev_sel <= 0xe) {
+ 		reg1->reg = HSWEP_PCU_MSR_PMON_BOX_FILTER;
+ 		reg1->idx = ev_sel - 0xb;
+ 		/*
+ 		 * Select the whole byte belonging to band idx; the shift must be
+ 		 * in bytes (idx * 8), not bits. The ULL constant keeps the band 3
+ 		 * mask out of the sign bit of a plain int.
+ 		 * NOTE(review): assumes band N occupies config1 bits 8N..8N+7, as
+ 		 * the filter_band0-3 format attributes and the snbep_pcu_*
+ 		 * constraint helpers are expected to - confirm against them.
+ 		 */
+ 		reg1->config = event->attr.config1 & (0xffULL << (reg1->idx * 8));
+ 	}
+ 	return 0;
+ }
+
+static struct intel_uncore_ops hswep_uncore_pcu_ops = {
+ SNBEP_UNCORE_MSR_OPS_COMMON_INIT(),
+ .hw_config = hswep_pcu_hw_config,
+ .get_constraint = snbep_pcu_get_constraint,
+ .put_constraint = snbep_pcu_put_constraint,
+};
+
+static struct intel_uncore_type hswep_uncore_pcu = {
+ .name = "pcu",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0,
+ .event_ctl = HSWEP_PCU_MSR_PMON_CTL0,
+ .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
+ .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &hswep_uncore_pcu_ops,
+ .format_group = &snbep_uncore_pcu_format_group,
+};
+
+static struct intel_uncore_type *hswep_msr_uncores[] = {
+ &hswep_uncore_ubox,
+ &hswep_uncore_cbox,
+ &hswep_uncore_sbox,
+ &hswep_uncore_pcu,
+ NULL,
+};
+
+ /*
+  * Select the Haswell-EP MSR-based uncore types.
+  * The C-box count is first capped at boot_cpu_data.x86_max_cores, then
+  * uncore_msr_uncores is pointed at the Haswell-EP type list.
+  */
+ void hswep_uncore_cpu_init(void)
+ {
+ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+ hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+ uncore_msr_uncores = hswep_msr_uncores;
+ }
+
+static struct intel_uncore_type hswep_uncore_ha = {
+ .name = "ha",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 48,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct uncore_event_desc hswep_uncore_imc_events[] = {
+ INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x00,umask=0x00"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
+ INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
+ { /* end: all zeroes */ },
+};
+
+static struct intel_uncore_type hswep_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 8,
+ .perf_ctr_bits = 48,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+ .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+ .event_descs = hswep_uncore_imc_events,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_ops hswep_uncore_irp_ops = {
+ .init_box = snbep_uncore_pci_init_box,
+ .disable_box = snbep_uncore_pci_disable_box,
+ .enable_box = snbep_uncore_pci_enable_box,
+ .disable_event = ivbep_uncore_irp_disable_event,
+ .enable_event = ivbep_uncore_irp_enable_event,
+ .read_counter = ivbep_uncore_irp_read_counter,
+};
+
+static struct intel_uncore_type hswep_uncore_irp = {
+ .name = "irp",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .event_mask = SNBEP_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .ops = &hswep_uncore_irp_ops,
+ .format_group = &snbep_uncore_format_group,
+};
+
+static struct intel_uncore_type hswep_uncore_qpi = {
+ .name = "qpi",
+ .num_counters = 5,
+ .num_boxes = 3,
+ .perf_ctr_bits = 48,
+ .perf_ctr = SNBEP_PCI_PMON_CTR0,
+ .event_ctl = SNBEP_PCI_PMON_CTL0,
+ .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
+ .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
+ .num_shared_regs = 1,
+ .ops = &snbep_uncore_qpi_ops,
+ .format_group = &snbep_uncore_qpi_format_group,
+};
+
+static struct event_constraint hswep_uncore_r2pcie_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x24, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x27, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2a, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x2b, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r2pcie = {
+ .name = "r2pcie",
+ .num_counters = 4,
+ .num_boxes = 1,
+ .perf_ctr_bits = 48,
+ .constraints = hswep_uncore_r2pcie_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct event_constraint hswep_uncore_r3qpi_constraints[] = {
+ UNCORE_EVENT_CONSTRAINT(0x01, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x07, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x08, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x09, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0a, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x0e, 0x7),
+ UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+ UNCORE_EVENT_CONSTRAINT(0x14, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x15, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x28, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x29, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2c, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2e, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x2f, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
+ UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
+ EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type hswep_uncore_r3qpi = {
+ .name = "r3qpi",
+ .num_counters = 4,
+ .num_boxes = 3,
+ .perf_ctr_bits = 44,
+ .constraints = hswep_uncore_r3qpi_constraints,
+ SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+ /*
+  * Indices of the Haswell-EP PCI uncore types. They are used as the array
+  * designators in hswep_pci_uncores[] and as the type argument of
+  * UNCORE_PCI_DEV_DATA() in hswep_uncore_pci_ids.
+  */
+ enum {
+ HSWEP_PCI_UNCORE_HA,
+ HSWEP_PCI_UNCORE_IMC,
+ HSWEP_PCI_UNCORE_IRP,
+ HSWEP_PCI_UNCORE_QPI,
+ HSWEP_PCI_UNCORE_R2PCIE,
+ HSWEP_PCI_UNCORE_R3QPI,
+ };
+
+ /*
+  * NULL-terminated list of the Haswell-EP PCI uncore types, indexed by the
+  * HSWEP_PCI_UNCORE_* enum; installed by hswep_uncore_pci_init().
+  */
+ static struct intel_uncore_type *hswep_pci_uncores[] = {
+ [HSWEP_PCI_UNCORE_HA] = &hswep_uncore_ha,
+ [HSWEP_PCI_UNCORE_IMC] = &hswep_uncore_imc,
+ [HSWEP_PCI_UNCORE_IRP] = &hswep_uncore_irp,
+ [HSWEP_PCI_UNCORE_QPI] = &hswep_uncore_qpi,
+ [HSWEP_PCI_UNCORE_R2PCIE] = &hswep_uncore_r2pcie,
+ [HSWEP_PCI_UNCORE_R3QPI] = &hswep_uncore_r3qpi,
+ NULL,
+ };
+
+ /*
+  * PCI ids of the Haswell-EP uncore devices. driver_data packs the uncore
+  * type (a HSWEP_PCI_UNCORE_* index, or UNCORE_EXTRA_PCI_DEV for the QPI
+  * port filter devices) together with the box / extra-device index.
+  *
+  * Declared as a plain const pci_device_id array, like the IvyTown table,
+  * rather than through the deprecated DEFINE_PCI_DEVICE_TABLE() macro.
+  */
+ static const struct pci_device_id hswep_uncore_pci_ids[] = {
+ 	{ /* Home Agent 0 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
+ 	},
+ 	{ /* Home Agent 1 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f38),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 1),
+ 	},
+ 	{ /* MC0 Channel 0 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb0),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 0),
+ 	},
+ 	{ /* MC0 Channel 1 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb1),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 1),
+ 	},
+ 	{ /* MC0 Channel 2 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb4),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 2),
+ 	},
+ 	{ /* MC0 Channel 3 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fb5),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 3),
+ 	},
+ 	{ /* MC1 Channel 0 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd0),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 4),
+ 	},
+ 	{ /* MC1 Channel 1 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd1),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 5),
+ 	},
+ 	{ /* MC1 Channel 2 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd4),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 6),
+ 	},
+ 	{ /* MC1 Channel 3 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fd5),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IMC, 7),
+ 	},
+ 	{ /* IRP */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f39),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_IRP, 0),
+ 	},
+ 	{ /* QPI0 Port 0 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f32),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 0),
+ 	},
+ 	{ /* QPI0 Port 1 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f33),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 1),
+ 	},
+ 	{ /* QPI1 Port 2 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3a),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_QPI, 2),
+ 	},
+ 	{ /* R2PCIe */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f34),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R2PCIE, 0),
+ 	},
+ 	{ /* R3QPI0 Link 0 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f36),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 0),
+ 	},
+ 	{ /* R3QPI0 Link 1 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f37),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 1),
+ 	},
+ 	{ /* R3QPI1 Link 2 */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f3e),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_R3QPI, 2),
+ 	},
+ 	{ /* QPI Port 0 filter */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f86),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ 						   SNBEP_PCI_QPI_PORT0_FILTER),
+ 	},
+ 	{ /* QPI Port 1 filter */
+ 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f96),
+ 		.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ 						   SNBEP_PCI_QPI_PORT1_FILTER),
+ 	},
+ 	{ /* end: all zeroes */ }
+ };
+
+static struct pci_driver hswep_uncore_pci_driver = {
+ .name = "hswep_uncore",
+ .id_table = hswep_uncore_pci_ids,
+};
+
+ /*
+  * Set up PCI-based Haswell-EP uncore support.
+  *
+  * Calls snbep_pci2phy_map_init(0x2f1e) and, only if that returns 0,
+  * installs the Haswell-EP PCI uncore type list and PCI driver.
+  *
+  * Returns 0 on success, otherwise the non-zero value returned by
+  * snbep_pci2phy_map_init().
+  */
+ int hswep_uncore_pci_init(void)
+ {
+ 	int err;
+
+ 	err = snbep_pci2phy_map_init(0x2f1e);
+ 	if (!err) {
+ 		uncore_pci_uncores = hswep_pci_uncores;
+ 		uncore_pci_driver = &hswep_uncore_pci_driver;
+ 	}
+ 	return err;
+ }
+/* end of Haswell-EP uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
index 838fa8772c62..5b0c232d1ee6 100644
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ b/arch/x86/kernel/cpu/perf_event_knc.c
@@ -217,7 +217,7 @@ static int knc_pmu_handle_irq(struct pt_regs *regs)
int bit, loops;
u64 status;
- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
knc_pmu_disable_all();
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 5d466b7d8609..f2e56783af3d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -915,7 +915,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
static void p4_pmu_disable_all(void)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -984,7 +984,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
static void p4_pmu_enable_all(int added)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1004,7 +1004,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
int idx, handled = 0;
u64 val;
- cpuc = &__get_cpu_var(cpu_hw_events);
+ cpuc = this_cpu_ptr(&cpu_hw_events);
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
int overflow;
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 507de8066594..f5ab56d14287 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -4,9 +4,14 @@
* Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
*
* Copyright (C) IBM Corporation, 2004. All rights reserved.
+ * Copyright (C) Red Hat Inc., 2014. All rights reserved.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
*
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
@@ -16,6 +21,7 @@
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -28,6 +34,45 @@
#include <asm/reboot.h>
#include <asm/virtext.h>
+/* Alignment required for elf header segment */
+#define ELF_CORE_HEADER_ALIGN 4096
+
+/* This primarily represents number of split ranges due to exclusion */
+#define CRASH_MAX_RANGES 16
+
+struct crash_mem_range {
+ u64 start, end; /* both bounds inclusive: users compute the size as end - start + 1 */
+};
+
+struct crash_mem {
+ unsigned int nr_ranges; /* number of valid entries at the front of ranges[] */
+ struct crash_mem_range ranges[CRASH_MAX_RANGES]; /* fixed-capacity list, edited in place by exclude_mem_range() */
+};
+
+/* Misc data about ram ranges needed to prepare elf headers */
+struct crash_elf_data {
+ struct kimage *image; /* image being loaded; its arch.backup_* fields are read while building PT_LOAD headers */
+ /*
+ * Total number of ram ranges we have after various adjustments for
+ * GART, crash reserved region etc.
+ */
+ unsigned int max_nr_ranges;
+ unsigned long gart_start, gart_end; /* filled by get_gart_ranges_callback(); gart_end == 0 is treated as "no GART" */
+
+ /* Pointer to elf header */
+ void *ehdr;
+ /* Pointer to next phdr */
+ void *bufp;
+ struct crash_mem mem; /* scratch range list, rebuilt for every RAM resource by elf_header_exclude_ranges() */
+};
+
+/* Used while preparing memory map entries for second kernel */
+struct crash_memmap_data {
+ struct boot_params *params; /* boot_params whose e820 map receives the generated entries */
+ /* Type of memory */
+ unsigned int type; /* copied into e820entry.type by memmap_entry_callback() */
+};
+
int in_crash_kexec;
/*
@@ -39,6 +84,7 @@ int in_crash_kexec;
*/
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+unsigned long crash_zero_bytes;
static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
@@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
#endif
crash_save_cpu(regs, safe_smp_processor_id());
}
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * walk_system_ram_range() callback: count the ranges reported by the walk.
+ *
+ * @start_pfn, @nr_pfn: range being reported (unused, only counted)
+ * @arg: points at the caller's unsigned int counter, see
+ *       fill_up_crash_elf_data()
+ *
+ * Always returns 0.
+ */
+static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+				unsigned long nr_pfn, void *arg)
+{
+	/* Same type as the counter the caller passes in */
+	unsigned int *nr_ranges = arg;
+
+	(*nr_ranges)++;
+	return 0;
+}
+
+/*
+ * walk_iomem_res() callback used for the "GART" resource: record the
+ * aperture bounds in the crash_elf_data passed as @arg.
+ */
+static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
+{
+	struct crash_elf_data *elf_data = arg;
+
+	elf_data->gart_end = end;
+	elf_data->gart_start = start;
+
+	/* Not expecting more than 1 gart aperture */
+	return 1;
+}
+
+
+/*
+ * Gather what is needed to size the elf headers for ram regions: the image,
+ * the GART aperture (if any) and an upper bound on the number of ram ranges
+ * once the excluded regions have been cut out.
+ */
+static void fill_up_crash_elf_data(struct crash_elf_data *ced,
+				   struct kimage *image)
+{
+	unsigned int count = 0;
+
+	ced->image = image;
+
+	/* Start with one range per entry reported by the ram walk */
+	walk_system_ram_range(0, -1, &count, get_nr_ram_ranges_callback);
+
+	/*
+	 * ELF headers are not created for the GART aperture because dumping
+	 * that memory from the second kernel leads to hang/crash. Look the
+	 * aperture up so that it can be excluded later.
+	 */
+	walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, ced,
+		       get_gart_ranges_callback);
+
+	/* Cutting the GART region out may split one range in two */
+	if (ced->gart_end)
+		count++;
+
+	/* So may the exclusion of the crash kernel region */
+	count++;
+
+	/* And of crashk_low_res, when it is in use */
+	if (crashk_low_res.end)
+		count++;
+
+	ced->max_nr_ranges = count;
+}
+
+/*
+ * Remove the inclusive area [mstart, mend] from the range list in @mem.
+ *
+ * Only the first range that overlaps the area is adjusted, and the area is
+ * clipped to that range first. Depending on the overlap the range is
+ * dropped, trimmed at one end, or split in two.
+ *
+ * Returns 0 on success (also when nothing overlaps), or -ENOMEM when a split
+ * would need more than CRASH_MAX_RANGES entries. The capacity check is done
+ * against nr_ranges and before anything is modified, so a failed call never
+ * writes past ranges[] and leaves @mem untouched.
+ */
+static int exclude_mem_range(struct crash_mem *mem,
+		unsigned long long mstart, unsigned long long mend)
+{
+	unsigned long long start, end;
+	unsigned int i, j;
+
+	for (i = 0; i < mem->nr_ranges; i++) {
+		start = mem->ranges[i].start;
+		end = mem->ranges[i].end;
+
+		if (mstart > end || mend < start)
+			continue;
+
+		/* Truncate any area outside of range */
+		if (mstart < start)
+			mstart = start;
+		if (mend > end)
+			mend = end;
+
+		if (mstart == start && mend == end) {
+			/* Whole range excluded: shift the rest to the left */
+			for (j = i; j + 1 < mem->nr_ranges; j++)
+				mem->ranges[j] = mem->ranges[j + 1];
+			mem->nr_ranges--;
+			mem->ranges[mem->nr_ranges].start = 0;
+			mem->ranges[mem->nr_ranges].end = 0;
+		} else if (mstart > start && mend < end) {
+			/*
+			 * Hole in the middle: the range is split and one more
+			 * slot is needed. Make sure it exists first.
+			 */
+			if (mem->nr_ranges >= CRASH_MAX_RANGES) {
+				pr_err("Too many crash ranges after split\n");
+				return -ENOMEM;
+			}
+
+			/* Move the following ranges one slot towards the end */
+			for (j = mem->nr_ranges; j > i + 1; j--)
+				mem->ranges[j] = mem->ranges[j - 1];
+
+			mem->ranges[i].end = mstart - 1;
+			mem->ranges[i + 1].start = mend + 1;
+			mem->ranges[i + 1].end = end;
+			mem->nr_ranges++;
+		} else if (mstart != start) {
+			/* Area covers the tail of the range */
+			mem->ranges[i].end = mstart - 1;
+		} else {
+			/* Area covers the head of the range */
+			mem->ranges[i].start = mend + 1;
+		}
+
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Build ced->mem.ranges[] for the ram range [mstart, mend]: start with that
+ * single range and cut out the crash kernel region(s) and the GART aperture.
+ * Each cut may split a range. Returns 0 or the error from
+ * exclude_mem_range().
+ */
+static int elf_header_exclude_ranges(struct crash_elf_data *ced,
+		unsigned long long mstart, unsigned long long mend)
+{
+	struct crash_mem *list = &ced->mem;
+	int err;
+
+	/* Start from a clean list that holds only the range given */
+	memset(list->ranges, 0, sizeof(list->ranges));
+	list->nr_ranges = 1;
+	list->ranges[0].start = mstart;
+	list->ranges[0].end = mend;
+
+	/* Exclude crashkernel region */
+	err = exclude_mem_range(list, crashk_res.start, crashk_res.end);
+	if (err)
+		return err;
+
+	if (crashk_low_res.end) {
+		err = exclude_mem_range(list, crashk_low_res.start,
+					crashk_low_res.end);
+		if (err)
+			return err;
+	}
+
+	/* Exclude GART region, if there is one */
+	if (!ced->gart_end)
+		return 0;
+
+	return exclude_mem_range(list, ced->gart_start, ced->gart_end);
+}
+
+/*
+ * walk_system_ram_res() callback: emit PT_LOAD program headers for the ram
+ * resource [start, end].
+ *
+ * The unwanted regions are cut out first, then one phdr is written at
+ * ced->bufp for every remaining piece and e_phnum is bumped accordingly.
+ * Returns 0, or the error from elf_header_exclude_ranges().
+ */
+static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+{
+	struct crash_elf_data *ced = arg;
+	struct kimage *image = ced->image;
+	struct crash_mem *cmem = &ced->mem;
+	Elf64_Ehdr *ehdr = ced->ehdr;
+	int err, idx;
+
+	/* Exclude unwanted mem ranges; the result lands in ced->mem */
+	err = elf_header_exclude_ranges(ced, start, end);
+	if (err)
+		return err;
+
+	for (idx = 0; idx < cmem->nr_ranges; idx++) {
+		unsigned long seg_start = cmem->ranges[idx].start;
+		unsigned long seg_end = cmem->ranges[idx].end;
+		unsigned long seg_len = seg_end - seg_start + 1;
+		Elf64_Phdr *phdr = ced->bufp;
+
+		/* Consume one phdr slot of the header buffer */
+		ced->bufp += sizeof(Elf64_Phdr);
+
+		phdr->p_type = PT_LOAD;
+		phdr->p_flags = PF_R|PF_W|PF_X;
+		phdr->p_align = 0;
+		phdr->p_paddr = seg_start;
+		phdr->p_vaddr = (unsigned long long) __va(seg_start);
+		phdr->p_filesz = phdr->p_memsz = seg_len;
+
+		/*
+		 * A range that matches the backup source exactly is read
+		 * from the backup segment instead of its own address.
+		 */
+		if (seg_start == image->arch.backup_src_start &&
+		    seg_len == image->arch.backup_src_sz)
+			phdr->p_offset = image->arch.backup_load_addr;
+		else
+			phdr->p_offset = seg_start;
+
+		ehdr->e_phnum++;
+		pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
+			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
+			ehdr->e_phnum, phdr->p_offset);
+	}
+
+	return 0;
+}
+
+/*
+ * Build the 64bit ELF core headers for the crash kernel.
+ *
+ * Layout: ELF header, one PT_NOTE per present cpu, one PT_NOTE for
+ * vmcoreinfo, on x86_64 one PT_LOAD for the kernel text mapping, then the
+ * PT_LOAD headers for system ram.
+ *
+ * On success returns 0 and hands the vzalloc()ed buffer and its size back
+ * through @addr and @sz; the caller owns the buffer. On failure a negative
+ * error is returned, nothing is handed back and the buffer is freed here.
+ */
+static int prepare_elf64_headers(struct crash_elf_data *ced,
+		void **addr, unsigned long *sz)
+{
+	Elf64_Ehdr *ehdr;
+	Elf64_Phdr *phdr;
+	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
+	unsigned char *buf, *bufp;
+	unsigned int cpu;
+	unsigned long long notes_addr;
+	int ret;
+
+	/* extra phdr for vmcoreinfo elf note */
+	nr_phdr = nr_cpus + 1;
+	nr_phdr += ced->max_nr_ranges;
+
+	/*
+	 * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping
+	 * area on x86_64 (ffffffff80000000 - ffffffffa0000000).
+	 * I think this is required by tools like gdb. So same physical
+	 * memory will be mapped in two elf headers. One will contain kernel
+	 * text virtual addresses and other will have __va(physical) addresses.
+	 */
+
+	nr_phdr++;
+	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
+	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);
+
+	buf = vzalloc(elf_sz);
+	if (!buf)
+		return -ENOMEM;
+
+	bufp = buf;
+	ehdr = (Elf64_Ehdr *)bufp;
+	bufp += sizeof(Elf64_Ehdr);
+	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
+	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+	ehdr->e_type = ET_CORE;
+	ehdr->e_machine = ELF_ARCH;
+	ehdr->e_version = EV_CURRENT;
+	ehdr->e_phoff = sizeof(Elf64_Ehdr);
+	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+	ehdr->e_phentsize = sizeof(Elf64_Phdr);
+
+	/* Prepare one phdr of type PT_NOTE for each present cpu */
+	for_each_present_cpu(cpu) {
+		phdr = (Elf64_Phdr *)bufp;
+		bufp += sizeof(Elf64_Phdr);
+		phdr->p_type = PT_NOTE;
+		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
+		phdr->p_offset = phdr->p_paddr = notes_addr;
+		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
+		(ehdr->e_phnum)++;
+	}
+
+	/* Prepare one PT_NOTE header for vmcoreinfo */
+	phdr = (Elf64_Phdr *)bufp;
+	bufp += sizeof(Elf64_Phdr);
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
+	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
+	(ehdr->e_phnum)++;
+
+#ifdef CONFIG_X86_64
+	/* Prepare PT_LOAD type program header for kernel text region */
+	phdr = (Elf64_Phdr *)bufp;
+	bufp += sizeof(Elf64_Phdr);
+	phdr->p_type = PT_LOAD;
+	phdr->p_flags = PF_R|PF_W|PF_X;
+	phdr->p_vaddr = (Elf64_Addr)_text;
+	phdr->p_filesz = phdr->p_memsz = _end - _text;
+	phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
+	(ehdr->e_phnum)++;
+#endif
+
+	/* Prepare PT_LOAD headers for system ram chunks. */
+	ced->ehdr = ehdr;
+	ced->bufp = bufp;
+	ret = walk_system_ram_res(0, -1, ced,
+			prepare_elf64_ram_headers_callback);
+	if (ret < 0) {
+		/* Nobody else knows about the buffer yet: do not leak it */
+		vfree(buf);
+		return ret;
+	}
+
+	*addr = buf;
+	*sz = elf_sz;
+	return 0;
+}
+
+/*
+ * Prepare the elf headers for @image and hand their address and size back
+ * through @addr and @sz. Returns 0 on success or a negative error.
+ */
+static int prepare_elf_headers(struct kimage *image, void **addr,
+					unsigned long *sz)
+{
+	struct crash_elf_data *elf_data;
+	int err;
+
+	/* Temporary bookkeeping, only needed while the headers are built */
+	elf_data = kzalloc(sizeof(struct crash_elf_data), GFP_KERNEL);
+	if (!elf_data)
+		return -ENOMEM;
+
+	fill_up_crash_elf_data(elf_data, image);
+
+	/* By default prepare 64bit headers */
+	err = prepare_elf64_headers(elf_data, addr, sz);
+
+	kfree(elf_data);
+	return err;
+}
+
+/*
+ * Append a copy of @entry to the e820 map in @params.
+ * Returns 0 on success, 1 when the map already holds E820MAX entries.
+ */
+static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
+{
+	unsigned int slot = params->e820_entries;
+
+	if (slot >= E820MAX)
+		return 1;
+
+	params->e820_map[slot] = *entry;
+	params->e820_entries = slot + 1;
+	return 0;
+}
+
+/*
+ * Resource walk callback: add [start, end] to the e820 map of cmd->params
+ * with the memory type stored in cmd->type. The result of add_e820_entry()
+ * is not propagated; the callback always returns 0.
+ */
+static int memmap_entry_callback(u64 start, u64 end, void *arg)
+{
+	struct crash_memmap_data *cmd = arg;
+	struct e820entry ei = {
+		.addr = start,
+		.size = end - start + 1,
+		.type = cmd->type,
+	};
+
+	add_e820_entry(cmd->params, &ei);
+	return 0;
+}
+
+/*
+ * Load [mstart, mend] into @cmem and cut out the segments of @image that
+ * hold the backup region and the elf headers. Returns 0 or the error from
+ * exclude_mem_range().
+ */
+static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
+				 unsigned long long mstart,
+				 unsigned long long mend)
+{
+	unsigned long lo, hi;
+	int err;
+
+	cmem->nr_ranges = 1;
+	cmem->ranges[0].start = mstart;
+	cmem->ranges[0].end = mend;
+
+	/* Exclude Backup region */
+	lo = image->arch.backup_load_addr;
+	hi = lo + image->arch.backup_src_sz - 1;
+	err = exclude_mem_range(cmem, lo, hi);
+	if (!err) {
+		/* Exclude elf header region */
+		lo = image->arch.elf_load_addr;
+		hi = lo + image->arch.elf_headers_sz - 1;
+		err = exclude_mem_range(cmem, lo, hi);
+	}
+
+	return err;
+}
+
+/*
+ * Prepare memory map for crash dump kernel.
+ *
+ * Entries are added to the e820 map of @params in this order: the backup
+ * source region, ACPI tables, ACPI NVS, crashk_low_res (if set) and finally
+ * what is left of crashk_res once the backup and elf header segments have
+ * been cut out. Returns 0 on success or a negative error.
+ */
+int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
+{
+	const unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	struct crash_memmap_data cmd;
+	struct crash_mem *cmem;
+	struct e820entry ei;
+	int idx, err;
+
+	cmem = vzalloc(sizeof(*cmem));
+	if (!cmem)
+		return -ENOMEM;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.params = params;
+
+	/* Add first 640K segment */
+	ei.addr = image->arch.backup_src_start;
+	ei.size = image->arch.backup_src_sz;
+	ei.type = E820_RAM;
+	add_e820_entry(params, &ei);
+
+	/* Add ACPI tables */
+	cmd.type = E820_ACPI;
+	walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
+		       memmap_entry_callback);
+
+	/* Add ACPI Non-volatile Storage */
+	cmd.type = E820_NVS;
+	walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
+		       memmap_entry_callback);
+
+	/* Add crashk_low_res region */
+	if (crashk_low_res.end) {
+		ei.addr = crashk_low_res.start;
+		ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+		ei.type = E820_RAM;
+		add_e820_entry(params, &ei);
+	}
+
+	/* Exclude some ranges from crashk_res and add rest to memmap */
+	err = memmap_exclude_ranges(image, cmem, crashk_res.start,
+				    crashk_res.end);
+	if (!err) {
+		for (idx = 0; idx < cmem->nr_ranges; idx++) {
+			const struct crash_mem_range *r = &cmem->ranges[idx];
+
+			ei.size = r->end - r->start + 1;
+
+			/* Only pieces of at least one page are reported */
+			if (ei.size >= PAGE_SIZE) {
+				ei.addr = r->start;
+				ei.type = E820_RAM;
+				add_e820_entry(params, &ei);
+			}
+		}
+	}
+
+	vfree(cmem);
+	return err;
+}
+
+/*
+ * walk_system_ram_res() callback: record [start, end] as the backup source
+ * region of the kimage passed in @arg.
+ */
+static int determine_backup_region(u64 start, u64 end, void *arg)
+{
+	struct kimage *image = arg;
+
+	image->arch.backup_src_sz = end - start + 1;
+	image->arch.backup_src_start = start;
+
+	/* Expecting only one range for backup region */
+	return 1;
+}
+
+/*
+ * Add the crash specific segments to @image: a zero filled placeholder for
+ * the backup region and the ELF core headers.
+ *
+ * Returns 0 on success or a negative error. On success the header buffer
+ * stays referenced by image->arch.elf_headers. If adding the header segment
+ * fails the buffer is freed here and the reference is cleared.
+ */
+int crash_load_segments(struct kimage *image)
+{
+	unsigned long src_start, src_sz, elf_sz;
+	void *elf_addr;
+	int ret;
+
+	/*
+	 * Determine and load a segment for backup area. First 640K RAM
+	 * region is backup source
+	 */
+
+	ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
+				image, determine_backup_region);
+
+	/* Zero or positive return values are ok */
+	if (ret < 0)
+		return ret;
+
+	src_start = image->arch.backup_src_start;
+	src_sz = image->arch.backup_src_sz;
+
+	/* Add backup segment. */
+	if (src_sz) {
+		/*
+		 * Ideally there is no source for backup segment. This is
+		 * copied in purgatory after crash. Just add a zero filled
+		 * segment for now to make sure checksum logic works fine.
+		 */
+		ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
+				       sizeof(crash_zero_bytes), src_sz,
+				       PAGE_SIZE, 0, -1, 0,
+				       &image->arch.backup_load_addr);
+		if (ret)
+			return ret;
+		pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
+			 image->arch.backup_load_addr, src_start, src_sz);
+	}
+
+	/* Prepare elf headers and add a segment */
+	ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+	if (ret)
+		return ret;
+
+	image->arch.elf_headers = elf_addr;
+	image->arch.elf_headers_sz = elf_sz;
+
+	ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
+			ELF_CORE_HEADER_ALIGN, 0, -1, 0,
+			&image->arch.elf_load_addr);
+	if (ret) {
+		/*
+		 * Drop the reference together with the buffer, so that no
+		 * later user of the image sees (or frees) a stale pointer.
+		 */
+		vfree(elf_addr);
+		image->arch.elf_headers = NULL;
+		image->arch.elf_headers_sz = 0;
+		return ret;
+	}
+	pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 image->arch.elf_load_addr, elf_sz, elf_sz);
+
+	return ret;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 7db54b5d5f86..3d3503351242 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -21,6 +21,7 @@
#include <asm/apic.h>
#include <asm/pci_x86.h>
#include <asm/setup.h>
+#include <asm/i8259.h>
__initdata u64 initial_dtb;
char __initdata cmd_line[COMMAND_LINE_SIZE];
@@ -165,82 +166,6 @@ static void __init dtb_lapic_setup(void)
#ifdef CONFIG_X86_IO_APIC
static unsigned int ioapic_id;
-static void __init dtb_add_ioapic(struct device_node *dn)
-{
- struct resource r;
- int ret;
-
- ret = of_address_to_resource(dn, 0, &r);
- if (ret) {
- printk(KERN_ERR "Can't obtain address from node %s.\n",
- dn->full_name);
- return;
- }
- mp_register_ioapic(++ioapic_id, r.start, gsi_top);
-}
-
-static void __init dtb_ioapic_setup(void)
-{
- struct device_node *dn;
-
- for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
- dtb_add_ioapic(dn);
-
- if (nr_ioapics) {
- of_ioapic = 1;
- return;
- }
- printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
-}
-#else
-static void __init dtb_ioapic_setup(void) {}
-#endif
-
-static void __init dtb_apic_setup(void)
-{
- dtb_lapic_setup();
- dtb_ioapic_setup();
-}
-
-#ifdef CONFIG_OF_FLATTREE
-static void __init x86_flattree_get_config(void)
-{
- u32 size, map_len;
- void *dt;
-
- if (!initial_dtb)
- return;
-
- map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
-
- initial_boot_params = dt = early_memremap(initial_dtb, map_len);
- size = of_get_flat_dt_size();
- if (map_len < size) {
- early_iounmap(dt, map_len);
- initial_boot_params = dt = early_memremap(initial_dtb, size);
- map_len = size;
- }
-
- unflatten_and_copy_device_tree();
- early_iounmap(dt, map_len);
-}
-#else
-static inline void x86_flattree_get_config(void) { }
-#endif
-
-void __init x86_dtb_init(void)
-{
- x86_flattree_get_config();
-
- if (!of_have_populated_dt())
- return;
-
- dtb_setup_hpet();
- dtb_apic_setup();
-}
-
-#ifdef CONFIG_X86_IO_APIC
-
struct of_ioapic_type {
u32 out_type;
u32 trigger;
@@ -276,10 +201,8 @@ static int ioapic_xlate(struct irq_domain *domain,
const u32 *intspec, u32 intsize,
irq_hw_number_t *out_hwirq, u32 *out_type)
{
- struct io_apic_irq_attr attr;
struct of_ioapic_type *it;
- u32 line, idx;
- int rc;
+ u32 line, idx, gsi;
if (WARN_ON(intsize < 2))
return -EINVAL;
@@ -291,13 +214,10 @@ static int ioapic_xlate(struct irq_domain *domain,
it = &of_ioapic_type[intspec[1]];
- idx = (u32) domain->host_data;
- set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
-
- rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
- cpu_to_node(0), &attr);
- if (rc)
- return rc;
+ idx = (u32)(long)domain->host_data;
+ gsi = mp_pin_to_gsi(idx, line);
+ if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
+ return -EBUSY;
*out_hwirq = line;
*out_type = it->out_type;
@@ -305,81 +225,86 @@ static int ioapic_xlate(struct irq_domain *domain,
}
const struct irq_domain_ops ioapic_irq_domain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
.xlate = ioapic_xlate,
};
-static void dt_add_ioapic_domain(unsigned int ioapic_num,
- struct device_node *np)
+static void __init dtb_add_ioapic(struct device_node *dn)
{
- struct irq_domain *id;
- struct mp_ioapic_gsi *gsi_cfg;
+ struct resource r;
int ret;
- int num;
-
- gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
- num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
-
- id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
- (void *)ioapic_num);
- BUG_ON(!id);
- if (gsi_cfg->gsi_base == 0) {
- /*
- * The first NR_IRQS_LEGACY irq descs are allocated in
- * early_irq_init() and need just a mapping. The
- * remaining irqs need both. All of them are preallocated
- * and assigned so we can keep the 1:1 mapping which the ioapic
- * is having.
- */
- irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
-
- if (num > NR_IRQS_LEGACY) {
- ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
- NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
- if (ret)
- pr_err("Error creating mapping for the "
- "remaining IRQs: %d\n", ret);
- }
- irq_set_default_host(id);
- } else {
- ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
- if (ret)
- pr_err("Error creating IRQ mapping: %d\n", ret);
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &ioapic_irq_domain_ops,
+ .dev = dn,
+ };
+
+ ret = of_address_to_resource(dn, 0, &r);
+ if (ret) {
+ printk(KERN_ERR "Can't obtain address from node %s.\n",
+ dn->full_name);
+ return;
}
+ mp_register_ioapic(++ioapic_id, r.start, gsi_top, &cfg);
}
-static void __init ioapic_add_ofnode(struct device_node *np)
+static void __init dtb_ioapic_setup(void)
{
- struct resource r;
- int i, ret;
+ struct device_node *dn;
- ret = of_address_to_resource(np, 0, &r);
- if (ret) {
- printk(KERN_ERR "Failed to obtain address for %s\n",
- np->full_name);
+ for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
+ dtb_add_ioapic(dn);
+
+ if (nr_ioapics) {
+ of_ioapic = 1;
return;
}
+ printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
+}
+#else
+static void __init dtb_ioapic_setup(void) {}
+#endif
- for (i = 0; i < nr_ioapics; i++) {
- if (r.start == mpc_ioapic_addr(i)) {
- dt_add_ioapic_domain(i, np);
- return;
- }
- }
- printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
+static void __init dtb_apic_setup(void)
+{
+ dtb_lapic_setup();
+ dtb_ioapic_setup();
}
-void __init x86_add_irq_domains(void)
+#ifdef CONFIG_OF_FLATTREE
+static void __init x86_flattree_get_config(void)
{
- struct device_node *dp;
+ u32 size, map_len;
+ void *dt;
- if (!of_have_populated_dt())
+ if (!initial_dtb)
return;
- for_each_node_with_property(dp, "interrupt-controller") {
- if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
- ioapic_add_ofnode(dp);
+ map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128);
+
+ initial_boot_params = dt = early_memremap(initial_dtb, map_len);
+ size = of_get_flat_dt_size();
+ if (map_len < size) {
+ early_iounmap(dt, map_len);
+ initial_boot_params = dt = early_memremap(initial_dtb, size);
+ map_len = size;
}
+
+ unflatten_and_copy_device_tree();
+ early_iounmap(dt, map_len);
}
#else
-void __init x86_add_irq_domains(void) { }
+static inline void x86_flattree_get_config(void) { }
#endif
+
+void __init x86_dtb_init(void)
+{
+ x86_flattree_get_config();
+
+ if (!of_have_populated_dt())
+ return;
+
+ dtb_setup_hpet();
+ dtb_apic_setup();
+}
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 988c00a1f60d..49f886481615 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -682,15 +682,14 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
* hibernation (32 bit) or software suspend and suspend to RAM (64 bit).
*
* This function requires the e820 map to be sorted and without any
- * overlapping entries and assumes the first e820 area to be RAM.
+ * overlapping entries.
*/
void __init e820_mark_nosave_regions(unsigned long limit_pfn)
{
int i;
- unsigned long pfn;
+ unsigned long pfn = 0;
- pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size);
- for (i = 1; i < e820.nr_map; i++) {
+ for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
if (pfn < PFN_UP(ei->addr))
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 47c410d99f5d..344b63f18d14 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -447,16 +447,14 @@ sysenter_exit:
sysenter_audit:
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
- addl $4,%esp
- CFI_ADJUST_CFA_OFFSET -4
- /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
- /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
- /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
- movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
- movl %eax,%edx /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
+ /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
+ /* movl PT_ECX(%esp), %ecx already set, a1: 3rd arg to audit */
+ pushl_cfi PT_ESI(%esp) /* a3: 5th arg */
+ pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */
call __audit_syscall_entry
- pushl_cfi %ebx
+ popl_cfi %ecx /* get that remapped edx off the stack */
+ popl_cfi %ecx /* get that remapped esi off the stack */
movl PT_EAX(%esp),%eax /* reload syscall number */
jmp sysenter_do_call
@@ -683,7 +681,7 @@ END(syscall_badsys)
sysenter_badsys:
movl $-ENOSYS,%eax
jmp sysenter_after_call
-END(syscall_badsys)
+END(sysenter_badsys)
CFI_ENDPROC
.macro FIXUP_ESPFIX_STACK
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2fac1343a90b..df088bb03fb3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -404,8 +404,8 @@ GLOBAL(system_call_after_swapgs)
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
+ SAVE_ARGS 8, 0, rax_enosys=1
+ movq_cfi rax,(ORIG_RAX-ARGOFFSET)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
@@ -417,7 +417,7 @@ system_call_fastpath:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja badsys
+ ja ret_from_sys_call /* and return regs->ax */
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
@@ -476,28 +476,8 @@ sysret_signal:
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
jmp int_check_syscall_exit_work
-badsys:
- movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
- jmp ret_from_sys_call
-
#ifdef CONFIG_AUDITSYSCALL
/*
- * Fast path for syscall audit without full syscall trace.
- * We just call __audit_syscall_entry() directly, and then
- * jump back to the normal fast path.
- */
-auditsys:
- movq %r10,%r9 /* 6th arg: 4th syscall arg */
- movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
- movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
- movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
- movq %rax,%rsi /* 2nd arg: syscall number */
- movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
- call __audit_syscall_entry
- LOAD_ARGS 0 /* reload call-clobbered registers */
- jmp system_call_fastpath
-
- /*
* Return fast path for syscall audit. Call __audit_syscall_exit()
* directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
* masked off.
@@ -514,18 +494,25 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- jz auditsys
-#endif
+ leaq -REST_SKIP(%rsp), %rdi
+ movq $AUDIT_ARCH_X86_64, %rsi
+ call syscall_trace_enter_phase1
+ test %rax, %rax
+ jnz tracesys_phase2 /* if needed, run the slow path */
+ LOAD_ARGS 0 /* else restore clobbered regs */
+ jmp system_call_fastpath /* and return to the fast path */
+
+tracesys_phase2:
SAVE_REST
- movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
- movq %rsp,%rdi
- call syscall_trace_enter
+ movq %rsp, %rdi
+ movq $AUDIT_ARCH_X86_64, %rsi
+ movq %rax,%rdx
+ call syscall_trace_enter_phase2
+
/*
* Reload arg registers from stack in case ptrace changed them.
- * We don't reload %rax because syscall_trace_enter() returned
+ * We don't reload %rax because syscall_trace_enter_phase2() returned
* the value it wants us to use in the table lookup.
*/
LOAD_ARGS ARGOFFSET, 1
@@ -536,7 +523,7 @@ tracesys:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
+ ja int_ret_from_sys_call /* RAX(%rsp) is already set */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 5f9cf20cdb68..3d5fb509bdeb 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -108,7 +108,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
int i;
for (i = 0; i < HBP_NUM; i++) {
- struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
if (!*slot) {
*slot = bp;
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
set_debugreg(info->address, i);
__this_cpu_write(cpu_debugreg[i], info->address);
- dr7 = &__get_cpu_var(cpu_dr7);
+ dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
@@ -146,7 +146,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
int i;
for (i = 0; i < HBP_NUM; i++) {
- struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
+ struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
if (*slot == bp) {
*slot = NULL;
@@ -157,7 +157,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
- dr7 = &__get_cpu_var(cpu_dr7);
+ dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 &= ~__encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5dd80814419..a9a4229f6161 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -375,7 +375,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
/*
* These bits must be zero.
*/
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 8af817105e29..e7cc5370cd2f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -111,8 +111,7 @@ static void make_8259A_irq(unsigned int irq)
{
disable_irq_nosync(irq);
io_apic_irqs &= ~(1<<irq);
- irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
- i8259A_chip.name);
+ irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq);
enable_irq(irq);
}
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
index d30acdc1229d..82f8d02f0df2 100644
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -22,10 +22,13 @@
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
+#include <linux/debugfs.h>
+#include <linux/capability.h>
#include <asm/iosf_mbi.h>
#define PCI_DEVICE_ID_BAYTRAIL 0x0F00
+#define PCI_DEVICE_ID_BRASWELL 0x2280
#define PCI_DEVICE_ID_QUARK_X1000 0x0958
static DEFINE_SPINLOCK(iosf_mbi_lock);
@@ -187,6 +190,89 @@ bool iosf_mbi_available(void)
}
EXPORT_SYMBOL(iosf_mbi_available);
+#ifdef CONFIG_IOSF_MBI_DEBUG
+static u32 dbg_mdr;
+static u32 dbg_mcr;
+static u32 dbg_mcrx;
+
+/* debugfs read handler: report the u32 that @data points at. */
+static int mcr_get(void *data, u64 *val)
+{
+	const u32 *reg = data;
+
+	*val = *reg;
+	return 0;
+}
+
+/*
+ * debugfs write handler for "mcr": store the value and run the mailbox
+ * transaction it describes.
+ *
+ * Bits 31-24 of the value are the command, bits 23-16 the port and bits 15-8
+ * the offset, which is combined with dbg_mcrx. Bit 0 of the command selects
+ * a write of dbg_mdr, otherwise a read into dbg_mdr.
+ *
+ * Returns -EACCES without CAP_SYS_RAWIO (the value has been stored by then),
+ * otherwise the result of iosf_mbi_write()/iosf_mbi_read().
+ */
+static int mcr_set(void *data, u64 val)
+{
+	const u32 mcr = (u32)val;
+	const u8 command = (mcr >> 24) & 0xFF;
+	const u8 port = (mcr >> 16) & 0xFF;
+	const u8 offset = (mcr >> 8) & 0xFF;
+
+	*(u32 *)data = mcr;
+
+	if (!capable(CAP_SYS_RAWIO))
+		return -EACCES;
+
+	if (command & 1u)
+		return iosf_mbi_write(port, command, dbg_mcrx | offset,
+				      dbg_mdr);
+
+	return iosf_mbi_read(port, command, dbg_mcrx | offset, &dbg_mdr);
+}
+DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
+
+static struct dentry *iosf_dbg;
+
+/*
+ * Create the "iosf_sb" debugfs directory with its "mdr", "mcrx" and "mcr"
+ * files.
+ *
+ * Every creation result is checked. If any file cannot be created the whole
+ * directory is removed again and iosf_dbg is reset, so that a later
+ * debugfs_remove_recursive(iosf_dbg) does not touch a stale dentry.
+ */
+static void iosf_sideband_debug_init(void)
+{
+	struct dentry *d;
+
+	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
+	if (IS_ERR_OR_NULL(iosf_dbg))
+		return;
+
+	/* mdr */
+	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcrx */
+	d = debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	/* mcr - initiates mailbox transaction */
+	d = debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
+	if (IS_ERR_OR_NULL(d))
+		goto cleanup;
+
+	return;
+
+cleanup:
+	/* Tear down the directory, not just the entry that failed */
+	debugfs_remove_recursive(iosf_dbg);
+	iosf_dbg = NULL;
+}
+
+static void iosf_debugfs_init(void)
+{
+ iosf_sideband_debug_init(); /* creates the iosf_sb debugfs directory and its files */
+}
+
+static void iosf_debugfs_remove(void)
+{
+ debugfs_remove_recursive(iosf_dbg); /* iosf_dbg is the iosf_sb directory set up by iosf_sideband_debug_init() */
+}
+#else
+static inline void iosf_debugfs_init(void) { }
+static inline void iosf_debugfs_remove(void) { }
+#endif /* CONFIG_IOSF_MBI_DEBUG */
+
static int iosf_mbi_probe(struct pci_dev *pdev,
const struct pci_device_id *unused)
{
@@ -202,8 +288,9 @@ static int iosf_mbi_probe(struct pci_dev *pdev,
return 0;
}
-static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
+static const struct pci_device_id iosf_mbi_pci_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
{ 0, },
};
@@ -217,11 +304,15 @@ static struct pci_driver iosf_mbi_pci_driver = {
static int __init iosf_mbi_init(void)
{
+ iosf_debugfs_init();
+
return pci_register_driver(&iosf_mbi_pci_driver);
}
static void __exit iosf_mbi_exit(void)
{
+ iosf_debugfs_remove();
+
pci_unregister_driver(&iosf_mbi_pci_driver);
if (mbi_pdev) {
pci_dev_put(mbi_pdev);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 4d1c746892eb..e4b503d5558c 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -52,13 +52,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
regs->sp <= curbase + THREAD_SIZE)
return;
- irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+ irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) +
STACK_TOP_MARGIN;
- irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+ irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
return;
- oist = &__get_cpu_var(orig_ist);
+ oist = this_cpu_ptr(&orig_ist);
estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 1de84e3ab4e0..15d741ddfeeb 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -41,7 +41,7 @@ __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
void arch_irq_work_raise(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
- if (!cpu_has_apic)
+ if (!arch_irq_work_has_interrupt())
return;
apic->send_IPI_self(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7f50156542fb..4de73ee78361 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -70,7 +70,6 @@ int vector_used_by_percpu_irq(unsigned int vector)
void __init init_ISA_irqs(void)
{
struct irq_chip *chip = legacy_pic->chip;
- const char *name = chip->name;
int i;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -78,8 +77,8 @@ void __init init_ISA_irqs(void)
#endif
legacy_pic->init(0);
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
- irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
+ for (i = 0; i < nr_legacy_irqs(); i++)
+ irq_set_chip_and_handler(i, chip, handle_level_irq);
}
void __init init_IRQ(void)
@@ -87,12 +86,6 @@ void __init init_IRQ(void)
int i;
/*
- * We probably need a better place for this, but it works for
- * now ...
- */
- x86_add_irq_domains();
-
- /*
* On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
* If these IRQ's are handled by legacy interrupt-controllers like PIC,
* then this configuration will likely be static after the boot. If
@@ -100,7 +93,7 @@ void __init init_IRQ(void)
* then this vector space can be freed and re-used dynamically as the
* irq's migrate etc.
*/
- for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
+ for (i = 0; i < nr_legacy_irqs(); i++)
per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
x86_init.irqs.intr_init();
@@ -121,7 +114,7 @@ void setup_vector_irq(int cpu)
* legacy PIC, for the new cpu that is coming online, setup the static
* legacy vector to irq mapping:
*/
- for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
#endif
@@ -209,7 +202,7 @@ void __init native_init_IRQ(void)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
- if (!acpi_ioapic && !of_ioapic)
+ if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
new file mode 100644
index 000000000000..ca05f86481aa
--- /dev/null
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -0,0 +1,554 @@
+/*
+ * Kexec bzImage loader
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ * Authors:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#define pr_fmt(fmt) "kexec-bzImage64: " fmt
+
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/efi.h>
+#include <linux/verify_pefile.h>
+#include <keys/system_keyring.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+#include <asm/crash.h>
+#include <asm/efi.h>
+#include <asm/kexec-bzimage64.h>
+
+#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */
+
+/*
+ * Defines lowest physical address for various segments. Not sure where
+ * exactly these limits came from. Current bzimage64 loader in kexec-tools
+ * uses these so I am retaining it. It can be changed over time as we gain
+ * more insight.
+ */
+#define MIN_PURGATORY_ADDR 0x3000
+#define MIN_BOOTPARAM_ADDR 0x3000
+#define MIN_KERNEL_LOAD_ADDR 0x100000
+#define MIN_INITRD_LOAD_ADDR 0x1000000
+
+/*
+ * This is a place holder for all boot loader specific data structure which
+ * gets allocated in one call but gets freed much later during cleanup
+ * time. Right now there is only one field but it can grow as need be.
+ */
+struct bzimage64_data {
+ /*
+ * Temporary buffer to hold bootparams buffer. This should be
+ * freed once the bootparam segment has been loaded.
+ * Set by bzImage64_load() and released by bzImage64_cleanup().
+ */
+ void *bootparams_buf;
+};
+
+/*
+ * Record the initrd load address and length in @params. Each value is
+ * split in two: the low 32 bits go into the setup header, the high 32 bits
+ * into the matching ext_ramdisk_* field. Always returns 0.
+ */
+static int setup_initrd(struct boot_params *params,
+		unsigned long initrd_load_addr, unsigned long initrd_len)
+{
+	struct setup_header *hdr = &params->hdr;
+
+	/* Load address */
+	hdr->ramdisk_image = lower_32_bits(initrd_load_addr);
+	params->ext_ramdisk_image = upper_32_bits(initrd_load_addr);
+
+	/* Length */
+	hdr->ramdisk_size = lower_32_bits(initrd_len);
+	params->ext_ramdisk_size = upper_32_bits(initrd_len);
+
+	return 0;
+}
+
+/*
+ * Copy the command line into the bootparams buffer at @cmdline_offset and
+ * store its physical address (@bootparams_load_addr + @cmdline_offset) in
+ * the setup header, split into low and high 32 bits.
+ *
+ * @cmdline_len is expected to include the terminating NUL (the last byte
+ * copied is overwritten with '\0'); it may also be 0, meaning there is no
+ * command line at all. For crash images "elfcorehdr=0x<addr>" is appended.
+ *
+ * Always returns 0.
+ */
+static int setup_cmdline(struct kimage *image, struct boot_params *params,
+			 unsigned long bootparams_load_addr,
+			 unsigned long cmdline_offset, char *cmdline,
+			 unsigned long cmdline_len)
+{
+	char *cmdline_ptr = ((char *)params) + cmdline_offset;
+	unsigned long cmdline_ptr_phys, len;
+	uint32_t cmdline_low_32, cmdline_ext_32;
+
+	/*
+	 * len is the string length without the terminating NUL. An empty
+	 * buffer must not be indexed at [cmdline_len - 1]: that would land
+	 * one byte before the command line area, inside boot_params.
+	 */
+	len = 0;
+	if (cmdline_len) {
+		memcpy(cmdline_ptr, cmdline, cmdline_len);
+		len = cmdline_len - 1;
+	}
+
+	if (image->type == KEXEC_TYPE_CRASH) {
+		/*
+		 * Overwrite the old terminator with the appended option.
+		 * Without a preceding command line the separating space is
+		 * dropped, which keeps the text plus its NUL within the
+		 * MAX_ELFCOREHDR_STR_LEN bytes the caller reserved.
+		 */
+		len += sprintf(cmdline_ptr + len, "%selfcorehdr=0x%lx",
+			       cmdline_len ? " " : "",
+			       image->arch.elf_load_addr);
+	}
+	cmdline_ptr[len] = '\0';
+
+	pr_debug("Final command line is: %s\n", cmdline_ptr);
+	cmdline_ptr_phys = bootparams_load_addr + cmdline_offset;
+	cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL;
+	cmdline_ext_32 = cmdline_ptr_phys >> 32;
+
+	params->hdr.cmd_line_ptr = cmdline_low_32;
+	if (cmdline_ext_32)
+		params->ext_cmd_line_ptr = cmdline_ext_32;
+
+	return 0;
+}
+
+/*
+ * Copy the saved e820 memory map (e820_saved) into @params, limited to
+ * E820MAX entries. Always returns 0.
+ */
+static int setup_e820_entries(struct boot_params *params)
+{
+	/* TODO: Pass entries more than E820MAX in bootparams setup data */
+	unsigned int count = min_t(unsigned int, e820_saved.nr_map, E820MAX);
+
+	params->e820_entries = count;
+	memcpy(&params->e820_map, &e820_saved.map,
+	       count * sizeof(struct e820entry));
+
+	return 0;
+}
+
+#ifdef CONFIG_EFI
+/*
+ * Have efi_runtime_map_copy() fill @efi_map_sz bytes at @efi_map_offset
+ * inside the bootparams buffer, then record that area's physical address
+ * (@params_load_addr + @efi_map_offset, split into low/high 32 bits) and
+ * size in params->efi_info. Does nothing when @efi_map_sz is 0.
+ * Always returns 0.
+ */
+static int setup_efi_info_memmap(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset,
+ unsigned int efi_map_sz)
+{
+ void *efi_map = (void *)params + efi_map_offset;
+ unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!efi_map_sz)
+ return 0;
+
+ efi_runtime_map_copy(efi_map, efi_map_sz);
+
+ ei->efi_memmap = efi_map_phys_addr & 0xffffffff;
+ ei->efi_memmap_hi = efi_map_phys_addr >> 32;
+ ei->efi_memmap_size = efi_map_sz;
+
+ return 0;
+}
+
+/*
+ * Build a SETUP_EFI setup_data node at @efi_setup_data_offset inside the
+ * bootparams buffer. Its efi_setup_data payload, placed directly after the
+ * setup_data header, is filled from the running kernel's 'efi' state. The
+ * node is then linked in at the head of the params->hdr.setup_data list
+ * using its physical address. Always returns 0.
+ */
+static int
+prepare_add_efi_setup_data(struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned long setup_data_phys;
+ struct setup_data *sd = (void *)params + efi_setup_data_offset;
+ struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data);
+
+ esd->fw_vendor = efi.fw_vendor;
+ esd->runtime = efi.runtime;
+ esd->tables = efi.config_table;
+ esd->smbios = efi.smbios;
+
+ sd->type = SETUP_EFI;
+ sd->len = sizeof(struct efi_setup_data);
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + efi_setup_data_offset;
+ /* New node points at the old list head, then becomes the head itself */
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+
+ return 0;
+}
+
+/*
+ * Pass the running kernel's EFI state to the next kernel through @params:
+ * copies the loader signature, system table address and memory descriptor
+ * version from the current boot_params.efi_info, sets the descriptor size
+ * from efi_get_runtime_map_desc_size(), then calls
+ * setup_efi_info_memmap() and prepare_add_efi_setup_data().
+ *
+ * Leaves @params untouched when the current boot_params has no EFI memmap
+ * or when EFI_OLD_MEMMAP is in effect. Always returns 0; the return values
+ * of the two helpers are not checked.
+ */
+static int
+setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ struct efi_info *current_ei = &boot_params.efi_info;
+ struct efi_info *ei = &params->efi_info;
+
+ if (!current_ei->efi_memmap_size)
+ return 0;
+
+ /*
+ * If 1:1 mapping is not enabled, second kernel can not setup EFI
+ * and use EFI run time services. User space will have to pass
+ * acpi_rsdp=<addr> on kernel command line to make second kernel boot
+ * without efi.
+ */
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ ei->efi_loader_signature = current_ei->efi_loader_signature;
+ ei->efi_systab = current_ei->efi_systab;
+ ei->efi_systab_hi = current_ei->efi_systab_hi;
+
+ ei->efi_memdesc_version = current_ei->efi_memdesc_version;
+ ei->efi_memdesc_size = efi_get_runtime_map_desc_size();
+
+ setup_efi_info_memmap(params, params_load_addr, efi_map_offset,
+ efi_map_sz);
+ prepare_add_efi_setup_data(params, params_load_addr,
+ efi_setup_data_offset);
+ return 0;
+}
+#endif /* CONFIG_EFI */
+
+/*
+ * Fill in the parts of @params that are taken from the running kernel:
+ * hardware subarch, screen_info, the memory map (crash memmap for crash
+ * images, the saved e820 map otherwise), ext_mem_k/alt_mem_k, EFI state
+ * (CONFIG_EFI only) and the EDD buffers. APM, hd0/hd1 and sys_desc_table
+ * information is cleared.
+ *
+ * Returns 0 on success or the error from crash_setup_memmap_entries().
+ */
+static int
+setup_boot_parameters(struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int efi_map_offset, unsigned int efi_map_sz,
+ unsigned int efi_setup_data_offset)
+{
+ unsigned int nr_e820_entries;
+ unsigned long long mem_k, start, end;
+ int i, ret = 0;
+
+ /* Get subarch from existing bootparams */
+ params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch;
+
+ /* Copying screen_info will do? */
+ memcpy(&params->screen_info, &boot_params.screen_info,
+ sizeof(struct screen_info));
+
+ /* Fill in memsize later */
+ params->screen_info.ext_mem_k = 0;
+ params->alt_mem_k = 0;
+
+ /* Default APM info */
+ memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
+
+ /* Default drive info */
+ memset(&params->hd0_info, 0, sizeof(params->hd0_info));
+ memset(&params->hd1_info, 0, sizeof(params->hd1_info));
+
+ /* Default sysdesc table */
+ params->sys_desc_table.length = 0;
+
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = crash_setup_memmap_entries(image, params);
+ if (ret)
+ return ret;
+ } else
+ setup_e820_entries(params);
+
+ nr_e820_entries = params->e820_entries;
+
+ /*
+ * Derive the memory size fields from the E820_RAM range that starts
+ * at or below 1M and extends beyond it: mem_k is the number of KiB
+ * between 1M and the end of that range.
+ */
+ for (i = 0; i < nr_e820_entries; i++) {
+ if (params->e820_map[i].type != E820_RAM)
+ continue;
+ start = params->e820_map[i].addr;
+ end = params->e820_map[i].addr + params->e820_map[i].size - 1;
+
+ if ((start <= 0x100000) && end > 0x100000) {
+ mem_k = (end >> 10) - (0x100000 >> 10);
+ params->screen_info.ext_mem_k = mem_k;
+ params->alt_mem_k = mem_k;
+ /* Clamp both fields to the largest value each may report */
+ if (mem_k > 0xfc00)
+ params->screen_info.ext_mem_k = 0xfc00; /* 64M*/
+ if (mem_k > 0xffffffff)
+ params->alt_mem_k = 0xffffffff;
+ }
+ }
+
+#ifdef CONFIG_EFI
+ /* Setup EFI state */
+ setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
+ efi_setup_data_offset);
+#endif
+
+ /* Setup EDD info */
+ memcpy(params->eddbuf, boot_params.eddbuf,
+ EDDMAXNR * sizeof(struct edd_info));
+ params->eddbuf_entries = boot_params.eddbuf_entries;
+
+ memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer,
+ EDD_MBR_SIG_MAX * sizeof(unsigned int));
+
+ return ret;
+}
+
+/*
+ * Decide whether the @len bytes at @buf are a bzImage this loader accepts.
+ *
+ * The image must be at least two 512-byte sectors long, carry the "HdrS"
+ * magic and the 0xAA55 boot flag, use boot protocol 2.12 or newer, and
+ * have LOADED_HIGH, XLF_KERNEL_64 and XLF_CAN_BE_LOADED_ABOVE_4G set. It
+ * is also rejected when EFI runtime services are enabled but EFI is not
+ * 64-bit.
+ *
+ * Returns 0 when the image is accepted, -ENOEXEC otherwise.
+ */
+static int bzImage64_probe(const char *buf, unsigned long len)
+{
+ int ret = -ENOEXEC;
+ struct setup_header *header;
+
+ /* kernel should be at least two sectors long */
+ if (len < 2 * 512) {
+ pr_err("File is too short to be a bzImage\n");
+ return ret;
+ }
+
+ /* The setup header sits at the offset of 'hdr' within boot_params */
+ header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr));
+ if (memcmp((char *)&header->header, "HdrS", 4) != 0) {
+ pr_err("Not a bzImage\n");
+ return ret;
+ }
+
+ if (header->boot_flag != 0xAA55) {
+ pr_err("No x86 boot sector present\n");
+ return ret;
+ }
+
+ if (header->version < 0x020C) {
+ pr_err("Must be at least protocol version 2.12\n");
+ return ret;
+ }
+
+ if (!(header->loadflags & LOADED_HIGH)) {
+ pr_err("zImage not a bzImage\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_KERNEL_64)) {
+ pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n");
+ return ret;
+ }
+
+ if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) {
+ pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n");
+ return ret;
+ }
+
+ /*
+ * Can't handle 32bit EFI as it does not allow loading kernel
+ * above 4G. This should be handled by 32bit bzImage loader
+ */
+ if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) {
+ pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n");
+ return ret;
+ }
+
+ /* I've got a bzImage */
+ pr_debug("It's a relocatable bzImage64\n");
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Load a 64-bit bzImage for the file-based kexec path.
+ *
+ * In order, this: validates the real-mode setup size and the command line
+ * length; calls crash_load_segments() for crash images; loads purgatory;
+ * adds one combined buffer holding boot_params, the command line, room for
+ * the EFI memory map and an EFI setup_data node; adds the kernel proper
+ * (everything after the 16-bit setup sectors); adds the initrd if one was
+ * given; and finally programs purgatory's entry64_regs so that rsi holds
+ * the bootparams address and rip the kernel load address + 0x200.
+ *
+ * Returns a struct bzimage64_data pointer on success; the bootparams
+ * buffer it references is released later by bzImage64_cleanup(). Returns
+ * an ERR_PTR() on failure, after freeing the bootparams buffer if it had
+ * already been allocated.
+ */
+static void *bzImage64_load(struct kimage *image, char *kernel,
+			    unsigned long kernel_len, char *initrd,
+			    unsigned long initrd_len, char *cmdline,
+			    unsigned long cmdline_len)
+{
+
+	struct setup_header *header;
+	int setup_sects, kern16_size, ret = 0;
+	unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+	struct boot_params *params;
+	unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
+	unsigned long purgatory_load_addr;
+	unsigned long kernel_bufsz, kernel_memsz, kernel_align;
+	char *kernel_buf;
+	struct bzimage64_data *ldata;
+	struct kexec_entry64_regs regs64;
+	void *stack;
+	unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
+	unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+
+	header = (struct setup_header *)(kernel + setup_hdr_offset);
+	setup_sects = header->setup_sects;
+	/* A setup_sects of 0 is treated as 4 */
+	if (setup_sects == 0)
+		setup_sects = 4;
+
+	/* Size of the 16-bit part: the setup sectors plus one more sector */
+	kern16_size = (setup_sects + 1) * 512;
+	if (kernel_len < kern16_size) {
+		pr_err("bzImage truncated\n");
+		return ERR_PTR(-ENOEXEC);
+	}
+
+	if (cmdline_len > header->cmdline_size) {
+		pr_err("Kernel command line too long\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/*
+	 * In case of crash dump, we will append elfcorehdr=<addr> to
+	 * command line. Make sure it does not overflow
+	 */
+	if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) {
+		pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n");
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Allocate and load backup region */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		ret = crash_load_segments(image);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	/*
+	 * Load purgatory. For 64bit entry point, purgatory code can be
+	 * anywhere.
+	 */
+	ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1,
+				   &purgatory_load_addr);
+	if (ret) {
+		pr_err("Loading purgatory failed\n");
+		return ERR_PTR(ret);
+	}
+
+	pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+
+	/*
+	 * Load Bootparams and cmdline and space for efi stuff.
+	 *
+	 * Allocate memory together for multiple data structures so
+	 * that they all can go in single area/segment and we don't
+	 * have to create separate segment for each. Keeps things
+	 * little bit simple
+	 */
+	efi_map_sz = efi_get_runtime_map_size();
+	efi_map_sz = ALIGN(efi_map_sz, 16);
+	params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
+				MAX_ELFCOREHDR_STR_LEN;
+	params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
+	params_misc_sz = params_cmdline_sz + efi_map_sz +
+				sizeof(struct setup_data) +
+				sizeof(struct efi_setup_data);
+
+	params = kzalloc(params_misc_sz, GFP_KERNEL);
+	if (!params)
+		return ERR_PTR(-ENOMEM);
+	efi_map_offset = params_cmdline_sz;
+	efi_setup_data_offset = efi_map_offset + efi_map_sz;
+
+	/*
+	 * Copy setup header onto bootparams. Documentation/x86/boot.txt
+	 *
+	 * The header ends at 0x0202 plus the byte stored at offset 0x0201.
+	 * That byte must be read as unsigned: through a plain (signed) char
+	 * a value of 0x80 or more would be negative and make the size wrap
+	 * around to a huge memcpy length.
+	 */
+	setup_header_size = 0x0202 + (unsigned char)kernel[0x0201] -
+				setup_hdr_offset;
+
+	/* Is there a limit on setup header size? */
+	memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
+
+	ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
+			       params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
+			       ULONG_MAX, 1, &bootparam_load_addr);
+	if (ret)
+		goto out_free_params;
+	pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 bootparam_load_addr, params_misc_sz, params_misc_sz);
+
+	/* Load kernel */
+	kernel_buf = kernel + kern16_size;
+	kernel_bufsz = kernel_len - kern16_size;
+	kernel_memsz = PAGE_ALIGN(header->init_size);
+	kernel_align = header->kernel_alignment;
+
+	ret = kexec_add_buffer(image, kernel_buf,
+			       kernel_bufsz, kernel_memsz, kernel_align,
+			       MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
+			       &kernel_load_addr);
+	if (ret)
+		goto out_free_params;
+
+	/* Report the buffer size and the memory size, not memsz twice */
+	pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		 kernel_load_addr, kernel_bufsz, kernel_memsz);
+
+	/* Load initrd high */
+	if (initrd) {
+		ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
+				       PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
+				       ULONG_MAX, 1, &initrd_load_addr);
+		if (ret)
+			goto out_free_params;
+
+		pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+				initrd_load_addr, initrd_len, initrd_len);
+
+		setup_initrd(params, initrd_load_addr, initrd_len);
+	}
+
+	/* The command line lives directly behind struct boot_params */
+	setup_cmdline(image, params, bootparam_load_addr,
+		      sizeof(struct boot_params), cmdline, cmdline_len);
+
+	/* bootloader info. Do we need a separate ID for kexec kernel loader? */
+	params->hdr.type_of_loader = 0x0D << 4;
+	params->hdr.loadflags = 0;
+
+	/* Setup purgatory regs for entry */
+	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+					     sizeof(regs64), 1);
+	if (ret)
+		goto out_free_params;
+
+	regs64.rbx = 0; /* Bootstrap Processor */
+	regs64.rsi = bootparam_load_addr;
+	regs64.rip = kernel_load_addr + 0x200;
+	stack = kexec_purgatory_get_symbol_addr(image, "stack_end");
+	if (IS_ERR(stack)) {
+		pr_err("Could not find address of symbol stack_end\n");
+		ret = -EINVAL;
+		goto out_free_params;
+	}
+
+	regs64.rsp = (unsigned long)stack;
+	/* Write the updated register set back into purgatory */
+	ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", &regs64,
+					     sizeof(regs64), 0);
+	if (ret)
+		goto out_free_params;
+
+	ret = setup_boot_parameters(image, params, bootparam_load_addr,
+				    efi_map_offset, efi_map_sz,
+				    efi_setup_data_offset);
+	if (ret)
+		goto out_free_params;
+
+	/* Allocate loader specific data */
+	ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL);
+	if (!ldata) {
+		ret = -ENOMEM;
+		goto out_free_params;
+	}
+
+	/*
+	 * Store pointer to params so that it could be freed after loading
+	 * params segment has been loaded and contents have been copied
+	 * somewhere else.
+	 */
+	ldata->bootparams_buf = params;
+	return ldata;
+
+out_free_params:
+	kfree(params);
+	return ERR_PTR(ret);
+}
+
+/*
+ * This cleanup function is called after various segments have been loaded.
+ *
+ * Frees the temporary bootparams buffer referenced by @loader_data (a
+ * struct bzimage64_data) and clears the pointer. The bzimage64_data
+ * structure itself is not freed here. A NULL @loader_data is accepted.
+ * Always returns 0.
+ */
+static int bzImage64_cleanup(void *loader_data)
+{
+ struct bzimage64_data *ldata = loader_data;
+
+ if (!ldata)
+ return 0;
+
+ kfree(ldata->bootparams_buf);
+ ldata->bootparams_buf = NULL;
+
+ return 0;
+}
+
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+/*
+ * Verify the PE signature of the @kernel_len bytes at @kernel against
+ * system_trusted_keyring.
+ *
+ * Returns 0 when verification succeeds and the key is reported as trusted,
+ * the negative error from verify_pefile_signature() when it fails, and
+ * -EKEYREJECTED when the signature verifies but is not trusted.
+ */
+static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
+{
+ bool trusted;
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ system_trusted_keyring, &trusted);
+ if (ret < 0)
+ return ret;
+ if (!trusted)
+ return -EKEYREJECTED;
+ return 0;
+}
+#endif
+
+/*
+ * Loader callbacks for 64-bit bzImage files. verify_sig is only provided
+ * when CONFIG_KEXEC_BZIMAGE_VERIFY_SIG is enabled.
+ */
+struct kexec_file_ops kexec_bzImage64_ops = {
+ .probe = bzImage64_probe,
+ .load = bzImage64_load,
+ .cleanup = bzImage64_cleanup,
+#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
+ .verify_sig = bzImage64_verify_sig,
+#endif
+};
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index f304773285ae..f1314d0bcf0a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -338,8 +338,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
* a relative jump.
*/
rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
- if (abs(rel) > 0x7fffffff)
+ if (abs(rel) > 0x7fffffff) {
+ __arch_remove_optimized_kprobe(op, 0);
return -ERANGE;
+ }
buf = (u8 *)op->optinsn.insn;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 3dd8e2c4d74a..f6945bef2cd1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
+#include <linux/nmi.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -243,9 +244,9 @@ u32 kvm_read_and_reset_pf_reason(void)
{
u32 reason = 0;
- if (__get_cpu_var(apf_reason).enabled) {
- reason = __get_cpu_var(apf_reason).reason;
- __get_cpu_var(apf_reason).reason = 0;
+ if (__this_cpu_read(apf_reason.enabled)) {
+ reason = __this_cpu_read(apf_reason.reason);
+ __this_cpu_write(apf_reason.reason, 0);
}
return reason;
@@ -318,7 +319,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
* there's no need for lock or memory barriers.
* An optimization barrier is implied in apic write.
*/
- if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
+ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
apic_write(APIC_EOI, APIC_EOI_ACK);
}
@@ -329,13 +330,13 @@ void kvm_guest_cpu_init(void)
return;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
- u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason));
+ u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
#ifdef CONFIG_PREEMPT
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
- __get_cpu_var(apf_reason).enabled = 1;
+ __this_cpu_write(apf_reason.enabled, 1);
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
}
@@ -344,8 +345,8 @@ void kvm_guest_cpu_init(void)
unsigned long pa;
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
- __get_cpu_var(kvm_apic_eoi) = 0;
- pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi))
+ __this_cpu_write(kvm_apic_eoi, 0);
+ pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
| KVM_MSR_ENABLED;
wrmsrl(MSR_KVM_PV_EOI_EN, pa);
}
@@ -356,11 +357,11 @@ void kvm_guest_cpu_init(void)
static void kvm_pv_disable_apf(void)
{
- if (!__get_cpu_var(apf_reason).enabled)
+ if (!__this_cpu_read(apf_reason.enabled))
return;
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
- __get_cpu_var(apf_reason).enabled = 0;
+ __this_cpu_write(apf_reason.enabled, 0);
printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
smp_processor_id());
@@ -499,6 +500,13 @@ void __init kvm_guest_init(void)
#else
kvm_guest_cpu_init();
#endif
+
+ /*
+ * Hard lockup detection is enabled by default. Disable it, as guests
+ * can get false positives too easily, for example if the host is
+ * overcommitted.
+ */
+ watchdog_enable_hardlockup_detector(false);
}
static noinline uint32_t __kvm_cpuid_base(void)
@@ -716,7 +724,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
if (in_nmi())
return;
- w = &__get_cpu_var(klock_waiting);
+ w = this_cpu_ptr(&klock_waiting);
cpu = smp_processor_id();
start = spin_time_start();
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 1667b1de8d5d..72e8e310258d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -247,7 +247,8 @@ void machine_kexec(struct kimage *image)
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
(unsigned long)page_list,
- image->start, cpu_has_pae,
+ image->start,
+ boot_cpu_has(X86_FEATURE_PAE),
image->preserve_context);
#ifdef CONFIG_KEXEC_JUMP
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 679cef0791cd..485981059a40 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
@@ -21,6 +23,13 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/debugreg.h>
+#include <asm/kexec-bzimage64.h>
+
+#ifdef CONFIG_KEXEC_FILE
+/* Image loaders tried, in array order, by arch_kexec_kernel_image_probe() */
+static struct kexec_file_ops *kexec_file_loaders[] = {
+ &kexec_bzImage64_ops,
+};
+#endif
static void free_transition_pgtable(struct kimage *image)
{
@@ -171,6 +180,45 @@ static void load_segments(void)
);
}
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Update purgatory as needed after various image segments have been prepared.
+ *
+ * Only acts on file-mode images of type KEXEC_TYPE_CRASH: passes the
+ * backup region's load address, source start and size from image->arch to
+ * kexec_purgatory_get_set_symbol() for the purgatory symbols backup_dest,
+ * backup_src and backup_sz.
+ *
+ * Returns 0 on success or when there is nothing to do, otherwise the first
+ * error returned by kexec_purgatory_get_set_symbol().
+ */
+static int arch_update_purgatory(struct kimage *image)
+{
+ int ret = 0;
+
+ if (!image->file_mode)
+ return 0;
+
+ /* Setup copying of backup region */
+ if (image->type == KEXEC_TYPE_CRASH) {
+ ret = kexec_purgatory_get_set_symbol(image, "backup_dest",
+ &image->arch.backup_load_addr,
+ sizeof(image->arch.backup_load_addr), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_src",
+ &image->arch.backup_src_start,
+ sizeof(image->arch.backup_src_start), 0);
+ if (ret)
+ return ret;
+
+ ret = kexec_purgatory_get_set_symbol(image, "backup_sz",
+ &image->arch.backup_src_sz,
+ sizeof(image->arch.backup_src_sz), 0);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+#else /* !CONFIG_KEXEC_FILE */
+/* Without CONFIG_KEXEC_FILE there is no purgatory to update. */
+static inline int arch_update_purgatory(struct kimage *image)
+{
+ return 0;
+}
+#endif /* CONFIG_KEXEC_FILE */
+
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -184,6 +232,11 @@ int machine_kexec_prepare(struct kimage *image)
if (result)
return result;
+ /* update purgatory as needed */
+ result = arch_update_purgatory(image);
+ if (result)
+ return result;
+
return 0;
}
@@ -283,3 +336,200 @@ void arch_crash_save_vmcoreinfo(void)
(unsigned long)&_text - __START_KERNEL);
}
+/* arch-dependent functionality related to kexec file-based syscall */
+
+#ifdef CONFIG_KEXEC_FILE
+/*
+ * Find a loader for the kernel image in @buf (@buf_len bytes).
+ *
+ * Each entry of kexec_file_loaders that has a probe callback is tried in
+ * order; the first one whose probe returns 0 is stored in image->fops.
+ *
+ * Returns 0 when a loader accepted the image. Otherwise returns the result
+ * of the last probe that ran, or -ENOEXEC if none ran.
+ */
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ int i, ret = -ENOEXEC;
+ struct kexec_file_ops *fops;
+
+ for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+ fops = kexec_file_loaders[i];
+ if (!fops || !fops->probe)
+ continue;
+
+ ret = fops->probe(buf, buf_len);
+ if (!ret) {
+ image->fops = fops;
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Load the kernel image through the loader selected at probe time.
+ *
+ * Any ELF headers buffer still attached to image->arch is vfree()d and the
+ * pointer cleared before loading.
+ *
+ * Returns whatever the loader's load callback returns, or
+ * ERR_PTR(-ENOEXEC) when no loader, or no load callback, is set.
+ */
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+ vfree(image->arch.elf_headers);
+ image->arch.elf_headers = NULL;
+
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+
+ return image->fops->load(image, image->kernel_buf,
+ image->kernel_buf_len, image->initrd_buf,
+ image->initrd_buf_len, image->cmdline_buf,
+ image->cmdline_buf_len);
+}
+
+/*
+ * Run the loader's cleanup callback on image->image_loader_data.
+ * Returns 0 when no loader or no cleanup callback is set, otherwise the
+ * callback's return value.
+ */
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+ if (!image->fops || !image->fops->cleanup)
+ return 0;
+
+ return image->fops->cleanup(image->image_loader_data);
+}
+
+/*
+ * Verify the signature of the @kernel_len bytes at @kernel through the
+ * loader selected at probe time.
+ *
+ * Returns the loader's verify_sig result, or -EKEYREJECTED when no loader
+ * is set or the loader provides no verify_sig callback.
+ */
+int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
+				 unsigned long kernel_len)
+{
+	if (!image->fops || !image->fops->verify_sig) {
+		/* Terminate the message so it is not merged with the next one */
+		pr_debug("kernel loader does not support signature verification.\n");
+		return -EKEYREJECTED;
+	}
+
+	return image->fops->verify_sig(kernel, kernel_len);
+}
+
+/*
+ * Apply purgatory relocations.
+ *
+ * ehdr: Pointer to elf headers
+ * sechdrs: Pointer to section headers.
+ * relsec: section index of SHT_RELA section.
+ *
+ * Returns 0 on success. Returns -ENOEXEC for an invalid string table or
+ * symbol section index, an undefined or common symbol, an unsupported
+ * relocation type, or a value that does not fit a 32-bit relocation.
+ *
+ * TODO: Some of the code belongs to generic code. Move that in kexec.c.
+ */
+int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr,
+ Elf64_Shdr *sechdrs, unsigned int relsec)
+{
+ unsigned int i;
+ Elf64_Rela *rel;
+ Elf64_Sym *sym;
+ void *location;
+ Elf64_Shdr *section, *symtabsec;
+ unsigned long address, sec_base, value;
+ const char *strtab, *name, *shstrtab;
+
+ /*
+ * ->sh_offset has been modified to keep the pointer to section
+ * contents in memory
+ */
+ rel = (void *)sechdrs[relsec].sh_offset;
+
+ /* Section to which relocations apply */
+ section = &sechdrs[sechdrs[relsec].sh_info];
+
+ pr_debug("Applying relocate section %u to %u\n", relsec,
+ sechdrs[relsec].sh_info);
+
+ /* Associated symbol table */
+ symtabsec = &sechdrs[sechdrs[relsec].sh_link];
+
+ /* String table */
+ if (symtabsec->sh_link >= ehdr->e_shnum) {
+ /* Invalid strtab section number */
+ pr_err("Invalid string table section index %d\n",
+ symtabsec->sh_link);
+ return -ENOEXEC;
+ }
+
+ strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset;
+
+ /* section header string table */
+ shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset;
+
+ for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
+
+ /*
+ * rel[i].r_offset contains byte offset from beginning
+ * of section to the storage unit affected.
+ *
+ * This is location to update (->sh_offset). This is temporary
+ * buffer where section is currently loaded. This will finally
+ * be loaded to a different address later, pointed to by
+ * ->sh_addr. kexec takes care of moving it
+ * (kexec_load_segment()).
+ */
+ location = (void *)(section->sh_offset + rel[i].r_offset);
+
+ /* Final address of the location */
+ address = section->sh_addr + rel[i].r_offset;
+
+ /*
+ * rel[i].r_info contains information about symbol table index
+ * w.r.t which relocation must be made and type of relocation
+ * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get
+ * these respectively.
+ */
+ sym = (Elf64_Sym *)symtabsec->sh_offset +
+ ELF64_R_SYM(rel[i].r_info);
+
+ /*
+ * Symbols without a name of their own are reported under the
+ * name of the section they belong to.
+ * NOTE(review): sechdrs[] is indexed by st_shndx here before
+ * st_shndx is range-checked further down.
+ */
+ if (sym->st_name)
+ name = strtab + sym->st_name;
+ else
+ name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+ pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n",
+ name, sym->st_info, sym->st_shndx, sym->st_value,
+ sym->st_size);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+ pr_err("Undefined symbol: %s\n", name);
+ return -ENOEXEC;
+ }
+
+ if (sym->st_shndx == SHN_COMMON) {
+ pr_err("symbol '%s' in common section\n", name);
+ return -ENOEXEC;
+ }
+
+ /* Absolute symbols have no section base to add */
+ if (sym->st_shndx == SHN_ABS)
+ sec_base = 0;
+ else if (sym->st_shndx >= ehdr->e_shnum) {
+ pr_err("Invalid section %d for symbol %s\n",
+ sym->st_shndx, name);
+ return -ENOEXEC;
+ } else
+ sec_base = sechdrs[sym->st_shndx].sh_addr;
+
+ /* Relocated value: symbol value + section base + addend */
+ value = sym->st_value;
+ value += sec_base;
+ value += rel[i].r_addend;
+
+ switch (ELF64_R_TYPE(rel[i].r_info)) {
+ case R_X86_64_NONE:
+ break;
+ case R_X86_64_64:
+ *(u64 *)location = value;
+ break;
+ case R_X86_64_32:
+ /* Store, then read back to detect truncation */
+ *(u32 *)location = value;
+ if (value != *(u32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_32S:
+ /* Same check for the sign-extended 32-bit form */
+ *(s32 *)location = value;
+ if ((s64)value != *(s32 *)location)
+ goto overflow;
+ break;
+ case R_X86_64_PC32:
+ /* Relative to the final address; no overflow check is done here */
+ value -= (u64)address;
+ *(u32 *)location = value;
+ break;
+ default:
+ pr_err("Unknown rela relocation: %llu\n",
+ ELF64_R_TYPE(rel[i].r_info));
+ return -ENOEXEC;
+ }
+ }
+ return 0;
+
+overflow:
+ pr_err("Overflow in relocation type %d value 0x%lx\n",
+ (int)ELF64_R_TYPE(rel[i].r_info), value);
+ return -ENOEXEC;
+}
+#endif /* CONFIG_KEXEC_FILE */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d2b56489d70f..2d2a237f2c73 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/pci.h>
+#include <linux/irqdomain.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -67,7 +68,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
boot_cpu_physical_apicid = m->apicid;
}
- printk(KERN_INFO "Processor #%d%s\n", m->apicid, bootup_cpu);
+ pr_info("Processor #%d%s\n", m->apicid, bootup_cpu);
generic_processor_info(apicid, m->apicver);
}
@@ -87,9 +88,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
#if MAX_MP_BUSSES < 256
if (m->busid >= MAX_MP_BUSSES) {
- printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->busid, str, MAX_MP_BUSSES - 1);
+ pr_warn("MP table busid value (%d) for bustype %s is too large, max. supported is %d\n",
+ m->busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
@@ -110,19 +110,29 @@ static void __init MP_bus_info(struct mpc_bus *m)
mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
#endif
} else
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+ pr_warn("Unknown bustype %s - ignoring\n", str);
}
+static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+ .unmap = mp_irqdomain_unmap,
+};
+
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_LEGACY,
+ .ops = &mp_ioapic_irqdomain_ops,
+ };
+
if (m->flags & MPC_APIC_USABLE)
- mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top, &cfg);
}
static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Int: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC INT %02x\n",
mp_irq->irqtype, mp_irq->irqflag & 3,
(mp_irq->irqflag >> 2) & 3, mp_irq->srcbus,
mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
@@ -135,8 +145,8 @@ static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
- apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+ apic_printk(APIC_VERBOSE,
+ "Lint: type %d, pol %d, trig %d, bus %02x, IRQ %02x, APIC ID %x, APIC LINT %02x\n",
m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbusid,
m->srcbusirq, m->destapic, m->destapiclint);
}
@@ -148,34 +158,33 @@ static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
if (memcmp(mpc->signature, MPC_SIGNATURE, 4)) {
- printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ pr_err("MPTABLE: bad signature [%c%c%c%c]!\n",
mpc->signature[0], mpc->signature[1],
mpc->signature[2], mpc->signature[3]);
return 0;
}
if (mpf_checksum((unsigned char *)mpc, mpc->length)) {
- printk(KERN_ERR "MPTABLE: checksum error!\n");
+ pr_err("MPTABLE: checksum error!\n");
return 0;
}
if (mpc->spec != 0x01 && mpc->spec != 0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->spec);
+ pr_err("MPTABLE: bad table version (%d)!!\n", mpc->spec);
return 0;
}
if (!mpc->lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
+ pr_err("MPTABLE: null local APIC address!\n");
return 0;
}
memcpy(oem, mpc->oem, 8);
oem[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s\n", oem);
+ pr_info("MPTABLE: OEM ID: %s\n", oem);
memcpy(str, mpc->productid, 12);
str[12] = 0;
- printk(KERN_INFO "MPTABLE: Product ID: %s\n", str);
+ pr_info("MPTABLE: Product ID: %s\n", str);
- printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->lapic);
+ pr_info("MPTABLE: APIC at: 0x%X\n", mpc->lapic);
return 1;
}
@@ -188,8 +197,8 @@ static void skip_entry(unsigned char **ptr, int *count, int size)
static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
{
- printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n"
- "type %x\n", *mpt);
+ pr_err("Your mptable is wrong, contact your HW vendor!\n");
+ pr_cont("type %x\n", *mpt);
print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
1, mpc, mpc->length, 1);
}
@@ -207,9 +216,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
if (!smp_check_mpc(mpc, oem, str))
return 0;
-#ifdef CONFIG_X86_32
- generic_mps_oem_check(mpc, oem, str);
-#endif
/* Initialize the lapic mapping */
if (!acpi_lapic)
register_lapic_address(mpc->lapic);
@@ -259,7 +265,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
}
if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
+ pr_err("MPTABLE: no processors registered!\n");
return num_processors;
}
@@ -295,16 +301,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
- "falling back to ELCR\n");
+ pr_info("ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... "
- "not using ELCR\n");
+ pr_err("ELCR contains invalid data... not using ELCR\n");
else {
- printk(KERN_INFO
- "Using ELCR to identify PCI interrupts\n");
+ pr_info("Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
@@ -353,7 +356,7 @@ static void __init construct_ioapic_table(int mpc_default_type)
bus.busid = 0;
switch (mpc_default_type) {
default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ pr_err("???\nUnknown standard configuration %d\n",
mpc_default_type);
/* fall through */
case 1:
@@ -462,8 +465,8 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
#ifdef CONFIG_X86_LOCAL_APIC
smp_found_config = 0;
#endif
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"
- "... disabling SMP support. (tell your hw vendor)\n");
+ pr_err("BIOS bug, MP table errors detected!...\n");
+ pr_cont("... disabling SMP support. (tell your hw vendor)\n");
early_iounmap(mpc, size);
return -1;
}
@@ -481,8 +484,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
if (!mp_irq_entries) {
struct mpc_bus bus;
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
- "using default mptable. (tell your hw vendor)\n");
+ pr_err("BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
bus.type = MP_BUS;
bus.busid = 0;
@@ -516,14 +518,14 @@ void __init default_get_smp_config(unsigned int early)
if (acpi_lapic && acpi_ioapic)
return;
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
- mpf->specification);
+ pr_info("Intel MultiProcessor Specification v1.%d\n",
+ mpf->specification);
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
if (mpf->feature2 & (1 << 7)) {
- printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
+ pr_info(" IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
- printk(KERN_INFO " Virtual Wire compatibility mode.\n");
+ pr_info(" Virtual Wire compatibility mode.\n");
pic_mode = 0;
}
#endif
@@ -539,8 +541,7 @@ void __init default_get_smp_config(unsigned int early)
return;
}
- printk(KERN_INFO "Default MP configuration #%d\n",
- mpf->feature1);
+ pr_info("Default MP configuration #%d\n", mpf->feature1);
construct_default_ISA_mptable(mpf->feature1);
} else if (mpf->physptr) {
@@ -550,7 +551,7 @@ void __init default_get_smp_config(unsigned int early)
BUG();
if (!early)
- printk(KERN_INFO "Processors: %d\n", num_processors);
+ pr_info("Processors: %d\n", num_processors);
/*
* Only use the first configuration found.
*/
@@ -583,10 +584,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
#endif
mpf_found = mpf;
- printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
- (unsigned long long) virt_to_phys(mpf),
- (unsigned long long) virt_to_phys(mpf) +
- sizeof(*mpf) - 1, mpf);
+ pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n",
+ (unsigned long long) virt_to_phys(mpf),
+ (unsigned long long) virt_to_phys(mpf) +
+ sizeof(*mpf) - 1, mpf);
mem = virt_to_phys(mpf);
memblock_reserve(mem, sizeof(*mpf));
@@ -735,7 +736,7 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
int nr_m_spare = 0;
unsigned char *mpt = ((unsigned char *)mpc) + count;
- printk(KERN_INFO "mpc_length %x\n", mpc->length);
+ pr_info("mpc_length %x\n", mpc->length);
while (count < mpc->length) {
switch (*mpt) {
case MP_PROCESSOR:
@@ -862,13 +863,13 @@ static int __init update_mp_table(void)
if (!smp_check_mpc(mpc, oem, str))
return 0;
- printk(KERN_INFO "mpf: %llx\n", (u64)virt_to_phys(mpf));
- printk(KERN_INFO "physptr: %x\n", mpf->physptr);
+ pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf));
+ pr_info("physptr: %x\n", mpf->physptr);
if (mpc_new_phys && mpc->length > mpc_new_length) {
mpc_new_phys = 0;
- printk(KERN_INFO "mpc_new_length is %ld, please use alloc_mptable=8k\n",
- mpc_new_length);
+ pr_info("mpc_new_length is %ld, please use alloc_mptable=8k\n",
+ mpc_new_length);
}
if (!mpc_new_phys) {
@@ -879,10 +880,10 @@ static int __init update_mp_table(void)
mpc->checksum = 0xff;
new = mpf_checksum((unsigned char *)mpc, mpc->length);
if (old == new) {
- printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
+ pr_info("mpc is readonly, please try alloc_mptable instead\n");
return 0;
}
- printk(KERN_INFO "use in-position replacing\n");
+ pr_info("use in-position replacing\n");
} else {
mpf->physptr = mpc_new_phys;
mpc_new = phys_to_virt(mpc_new_phys);
@@ -892,7 +893,7 @@ static int __init update_mp_table(void)
if (mpc_new_phys - mpf->physptr) {
struct mpf_intel *mpf_new;
/* steal 16 bytes from [0, 1k) */
- printk(KERN_INFO "mpf new: %x\n", 0x400 - 16);
+ pr_info("mpf new: %x\n", 0x400 - 16);
mpf_new = phys_to_virt(0x400 - 16);
memcpy(mpf_new, mpf, 16);
mpf = mpf_new;
@@ -900,7 +901,7 @@ static int __init update_mp_table(void)
}
mpf->checksum = 0;
mpf->checksum -= mpf_checksum((unsigned char *)mpf, 16);
- printk(KERN_INFO "physptr new: %x\n", mpf->physptr);
+ pr_info("physptr new: %x\n", mpf->physptr);
}
/*
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 0c424a67985d..0ee5025e0fa4 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -235,6 +235,11 @@ err:
pmc_dbgfs_unregister(pmc);
return -ENODEV;
}
+#else
+static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
+{
+ return 0;
+}
#endif /* CONFIG_DEBUG_FS */
static int pmc_setup_dev(struct pci_dev *pdev)
@@ -262,14 +267,12 @@ static int pmc_setup_dev(struct pci_dev *pdev)
/* PMC hardware registers setup */
pmc_hw_reg_setup(pmc);
-#ifdef CONFIG_DEBUG_FS
ret = pmc_dbgfs_register(pmc, pdev);
if (ret) {
iounmap(pmc->regmap);
- return ret;
}
-#endif /* CONFIG_DEBUG_FS */
- return 0;
+
+ return ret;
}
/*
diff --git a/arch/x86/kernel/preempt.S b/arch/x86/kernel/preempt.S
deleted file mode 100644
index ca7f0d58a87d..000000000000
--- a/arch/x86/kernel/preempt.S
+++ /dev/null
@@ -1,25 +0,0 @@
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/asm.h>
-#include <asm/calling.h>
-
-ENTRY(___preempt_schedule)
- CFI_STARTPROC
- SAVE_ALL
- call preempt_schedule
- RESTORE_ALL
- ret
- CFI_ENDPROC
-
-#ifdef CONFIG_CONTEXT_TRACKING
-
-ENTRY(___preempt_schedule_context)
- CFI_STARTPROC
- SAVE_ALL
- call preempt_schedule_context
- RESTORE_ALL
- ret
- CFI_ENDPROC
-
-#endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..e127ddaa2d5a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,14 +64,16 @@ EXPORT_SYMBOL_GPL(task_xstate_cachep);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- int ret;
-
*dst = *src;
- if (fpu_allocated(&src->thread.fpu)) {
- memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
- ret = fpu_alloc(&dst->thread.fpu);
- if (ret)
- return ret;
+
+ dst->thread.fpu_counter = 0;
+ dst->thread.fpu.has_fpu = 0;
+ dst->thread.fpu.last_cpu = ~0;
+ dst->thread.fpu.state = NULL;
+ if (tsk_used_math(src)) {
+ int err = fpu_alloc(&dst->thread.fpu);
+ if (err)
+ return err;
fpu_copy(dst, src);
}
return 0;
@@ -93,6 +95,7 @@ void arch_task_cache_init(void)
kmem_cache_create("task_xstate", xstate_size,
__alignof__(union thread_xstate),
SLAB_PANIC | SLAB_NOTRACK, NULL);
+ setup_xstate_comp();
}
/*
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7bc86bbe7485..8f3ebfe710d0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -138,6 +138,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) childregs;
p->thread.sp0 = (unsigned long) (childregs+1);
+ memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(p->flags & PF_KTHREAD)) {
/* kernel thread */
@@ -152,9 +153,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->orig_ax = -1;
childregs->cs = __KERNEL_CS | get_kernel_rpl();
childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
return 0;
}
*childregs = *current_pt_regs();
@@ -165,13 +164,10 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.ip = (unsigned long) ret_from_fork;
task_user_gs(p) = get_user_gs(current_pt_regs());
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
tsk = current;
err = -ENOMEM;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca5b02d405c3..3ed4a68d4013 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -163,7 +163,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) childregs;
p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
- p->thread.fpu_counter = 0;
p->thread.io_bitmap_ptr = NULL;
savesegment(gs, p->thread.gsindex);
@@ -193,8 +192,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs->sp = sp;
err = -ENOMEM;
- memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
-
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
IO_BITMAP_BYTES, GFP_KERNEL);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 678c0ada3b3c..749b0e423419 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,24 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
force_sig_info(SIGTRAP, &info, tsk);
}
-
-#ifdef CONFIG_X86_32
-# define IS_IA32 1
-#elif defined CONFIG_IA32_EMULATION
-# define IS_IA32 is_compat_task()
-#else
-# define IS_IA32 0
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+	/*
+	 * Report syscall entry to audit: the syscall number comes from
+	 * regs->orig_ax, followed by the four argument registers that
+	 * match the calling convention selected by @arch.
+	 */
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		/* AUDIT_ARCH_X86_64: arguments are in di, si, dx, r10. */
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+		return;
+	}
+#endif
+	/* Any other arch value: arguments are in bx, cx, dx, si. */
+	audit_syscall_entry(regs->orig_ax, regs->bx,
+			    regs->cx, regs->dx, regs->si);
+}
/*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2. If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0: resume the syscall
+ * 1: go to phase 2; no seccomp phase 2 needed
+ * anything else: go to phase 2; pass return value to seccomp
*/
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	/* Snapshot only the flags that request syscall-entry work. */
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		/*
+		 * Clear the _TIF_NOHZ mask that was tested above (not
+		 * TIF_NOHZ), so that a task with only nohz work pending
+		 * reaches the "work == 0" exit below.
+		 */
+		work &= ~_TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		/* The return-value contract below relies on these values. */
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			/* Skip the syscall but keep processing other work. */
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;	/* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;	/* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1. Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;	/* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+ unsigned long phase1_result)
{
long ret = 0;
+ u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+ _TIF_WORK_SYSCALL_ENTRY;
- user_exit();
+ BUG_ON(regs != task_pt_regs(current));
/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -1467,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
* do_debug() and we need to set it again to restore the user
* state. If we entered on the slow path, TF was already set.
*/
- if (test_thread_flag(TIF_SINGLESTEP))
+ if (work & _TIF_SINGLESTEP)
regs->flags |= X86_EFLAGS_TF;
- /* do the secure computing check first */
- if (secure_computing(regs->orig_ax)) {
+#ifdef CONFIG_SECCOMP
+ /*
+ * Call seccomp_phase2 before running the other hooks so that
+ * they can see any changes made by a seccomp tracer.
+ */
+ if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
/* seccomp failures shouldn't expose any additional code. */
- ret = -1L;
- goto out;
+ return -1;
}
+#endif
- if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+ if (unlikely(work & _TIF_SYSCALL_EMU))
ret = -1L;
if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1487,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->orig_ax);
- if (IS_IA32)
- audit_syscall_entry(AUDIT_ARCH_I386,
- regs->orig_ax,
- regs->bx, regs->cx,
- regs->dx, regs->si);
-#ifdef CONFIG_X86_64
- else
- audit_syscall_entry(AUDIT_ARCH_X86_64,
- regs->orig_ax,
- regs->di, regs->si,
- regs->dx, regs->r10);
-#endif
+ do_audit_syscall_entry(regs, arch);
-out:
return ret ?: regs->orig_ax;
}
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	/*
+	 * Run phase 1 and, only when it returns non-zero, phase 2.
+	 * Returns regs->orig_ax when phase 1 returns 0; otherwise
+	 * returns whatever phase 2 returns.
+	 */
+	unsigned long phase1;
+	u32 arch;
+
+	if (is_ia32_task())
+		arch = AUDIT_ARCH_I386;
+	else
+		arch = AUDIT_ARCH_X86_64;
+
+	phase1 = syscall_trace_enter_phase1(regs, arch);
+	if (phase1 != 0)
+		return syscall_trace_enter_phase2(regs, arch, phase1);
+
+	return regs->orig_ax;
+}
+
void syscall_trace_leave(struct pt_regs *regs)
{
bool step;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index ff898bbf579d..176a0f99d4da 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -498,6 +498,24 @@ void force_hpet_resume(void)
}
/*
+ * According to the datasheet e6xx systems have the HPET hardwired to
+ * 0xfed00000
+ */
+static void e6xx_force_enable_hpet(struct pci_dev *dev)
+{
+	/* Only act when no HPET address is known or already forced. */
+	if (!hpet_address && !force_hpet_address) {
+		force_hpet_address = 0xFED00000;
+		force_hpet_resume_type = NONE_FORCE_HPET_RESUME;
+		dev_printk(KERN_DEBUG, &dev->dev,
+			   "Force enabled HPET at 0x%lx\n",
+			   force_hpet_address);
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E6XX_CU,
+ e6xx_force_enable_hpet);
+
+/*
* HPET MSI on some boards (ATI SB700/SB800) has side effect on
* floppy DMA. Disable HPET MSI on such platforms.
* See erratum #27 (Misinterpreted MSI Requests May Result in
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 41ead8d3bc0b..ab08aa2276fb 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -879,6 +879,15 @@ void __init setup_arch(char **cmdline_p)
KERNEL_PGD_PTRS);
load_cr3(swapper_pg_dir);
+ /*
+ * Note: Quark X1000 CPUs advertise PGE incorrectly and require
+ * a cr3 based tlb flush, so the following __flush_tlb_all()
+ * will not flush anything because the cpu quirk which clears
+ * X86_FEATURE_PGE has not been invoked yet. Though due to the
+ * load_cr3() above the TLB has been flushed already. The
+ * quirk is invoked before subsequent calls to __flush_tlb_all()
+ * so proper operation is guaranteed.
+ */
__flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
@@ -1119,7 +1128,6 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
memblock_set_current_limit(get_max_mapped());
- dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -1150,6 +1158,7 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
initmem_init();
+ dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT);
/*
* Reserve memory for crash kernel after SRAT is parsed so that it
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 2851d63c1202..ed37a768d0fc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -675,6 +675,11 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* handler too.
*/
regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
+ /*
+ * Ensure the signal handler starts with the new fpu state.
+ */
+ if (used_math())
+ drop_init_fpu(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5492798930ef..668d8f2a8781 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -111,7 +111,6 @@ atomic_t init_deasserted;
static void smp_callin(void)
{
int cpuid, phys_id;
- unsigned long timeout;
/*
* If waken up by an INIT in an 82489DX configuration
@@ -130,37 +129,6 @@ static void smp_callin(void)
* (This works even if the APIC is not enabled.)
*/
phys_id = read_apic_id();
- if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
- panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
- phys_id, cpuid);
- }
- pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
-
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies, timeout)) {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (cpumask_test_cpu(cpuid, cpu_callout_mask))
- break;
- cpu_relax();
- }
-
- if (!time_before(jiffies, timeout)) {
- panic("%s: CPU%d started up but did not get a callout!\n",
- __func__, cpuid);
- }
/*
* the boot CPU has finished the init stage and is spinning
@@ -168,10 +136,6 @@ static void smp_callin(void)
* CPU, first the APIC. (this is probably redundant on most
* boards)
*/
-
- pr_debug("CALLIN, before setup_local_APIC()\n");
- if (apic->smp_callin_clear_local_apic)
- apic->smp_callin_clear_local_apic();
setup_local_APIC();
end_local_APIC_setup();
@@ -300,11 +264,19 @@ void smp_store_cpu_info(int id)
}
static bool
+topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+
+ return (cpu_to_node(cpu1) == cpu_to_node(cpu2));
+}
+
+static bool
topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
{
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
- return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2),
+ return !WARN_ONCE(!topology_same_node(c, o),
"sched: CPU #%d's %s-sibling CPU #%d is not on the same node! "
"[node: %d != %d]. Ignoring dependency.\n",
cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
@@ -345,17 +317,44 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
-static bool match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+/*
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
- if (c->phys_proc_id == o->phys_proc_id) {
- if (cpu_has(c, X86_FEATURE_AMD_DCM))
- return true;
-
- return topology_sane(c, o, "mc");
- }
+ if (c->phys_proc_id == o->phys_proc_id)
+ return true;
return false;
}
+static struct sched_domain_topology_level numa_inside_package_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+ { NULL, },
+};
+/*
+ * set_sched_topology() sets the topology internal to a CPU. The
+ * NUMA topologies are layered on top of it to build the full
+ * system topology.
+ *
+ * If NUMA nodes are observed to occur within a CPU package, this
+ * function should be called. It forces the sched domain code to
+ * only use the SMT and MC levels (as configured) for the CPU portion
+ * of the topology. This essentially falls back to relying on NUMA
+ * information from the SRAT table to describe the rest of the
+ * system topology.
+ */
+static void primarily_use_numa_for_topology(void)
+{
+ set_sched_topology(numa_inside_package_topology);
+}
+
void set_cpu_sibling_map(int cpu)
{
bool has_smt = smp_num_siblings > 1;
@@ -392,7 +391,7 @@ void set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
- if ((i == cpu) || (has_mp && match_mc(c, o))) {
+ if ((i == cpu) || (has_mp && match_die(c, o))) {
link_mask(core, cpu, i);
/*
@@ -414,6 +413,8 @@ void set_cpu_sibling_map(int cpu)
} else if (i != cpu && !c->booted_cores)
c->booted_cores = cpu_data(i).booted_cores;
}
+ if (match_die(c, o) && !topology_same_node(c, o))
+ primarily_use_numa_for_topology();
}
}
@@ -757,8 +758,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long start_ip = real_mode_header->trampoline_start;
unsigned long boot_error = 0;
- int timeout;
int cpu0_nmi_registered = 0;
+ unsigned long timeout;
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
@@ -806,6 +807,15 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
}
/*
+ * The AP might still be waiting on cpu_callout_mask in cpu_init()
+ * with its cpu_initialized_mask bit set if a previous attempt to
+ * online it timed out. Clear that bit so that after INIT/SIPI the
+ * AP can start from a clean state.
+ */
+ cpumask_clear_cpu(cpu, cpu_initialized_mask);
+ smp_mb();
+
+ /*
* Wake up a CPU in difference cases:
* - Use the method in the APIC driver if it's defined
* Otherwise,
@@ -819,53 +829,38 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
if (!boot_error) {
/*
- * allow APs to start initializing.
+ * Wait 10s total for a response from AP
*/
- pr_debug("Before Callout %d\n", cpu);
- cpumask_set_cpu(cpu, cpu_callout_mask);
- pr_debug("After Callout %d\n", cpu);
+ boot_error = -1;
+ timeout = jiffies + 10*HZ;
+ while (time_before(jiffies, timeout)) {
+ if (cpumask_test_cpu(cpu, cpu_initialized_mask)) {
+ /*
+ * Tell AP to proceed with initialization
+ */
+ cpumask_set_cpu(cpu, cpu_callout_mask);
+ boot_error = 0;
+ break;
+ }
+ udelay(100);
+ schedule();
+ }
+ }
+ if (!boot_error) {
/*
- * Wait 5s total for a response
+ * Wait till AP completes initial initialization
*/
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpumask_test_cpu(cpu, cpu_callin_mask))
- break; /* It has booted */
- udelay(100);
+ while (!cpumask_test_cpu(cpu, cpu_callin_mask)) {
/*
* Allow other tasks to run while we wait for the
* AP to come online. This also gives a chance
* for the MTRR work(triggered by the AP coming online)
* to be completed in the stop machine context.
*/
+ udelay(100);
schedule();
}
-
- if (cpumask_test_cpu(cpu, cpu_callin_mask)) {
- print_cpu_msr(&cpu_data(cpu));
- pr_debug("CPU%d: has booted.\n", cpu);
- } else {
- boot_error = 1;
- if (*trampoline_status == 0xA5A5A5A5)
- /* trampoline started but...? */
- pr_err("CPU%d: Stuck ??\n", cpu);
- else
- /* trampoline code not run */
- pr_err("CPU%d: Not responding\n", cpu);
- if (apic->inquire_remote_apic)
- apic->inquire_remote_apic(apicid);
- }
- }
-
- if (boot_error) {
- /* Try to put things back the way they were before ... */
- numa_remove_cpu(cpu); /* was set by numa_add_cpu */
-
- /* was set by do_boot_cpu() */
- cpumask_clear_cpu(cpu, cpu_callout_mask);
-
- /* was set by cpu_init() */
- cpumask_clear_cpu(cpu, cpu_initialized_mask);
}
/* mark "stuck" area as not stuck */
@@ -1143,10 +1138,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
enable_IO_APIC();
bsp_end_local_APIC_setup();
-
- if (apic->setup_portio_remap)
- apic->setup_portio_remap();
-
smpboot_setup_io_apic();
/*
* Set up local APIC timer on boot CPU.
@@ -1292,6 +1283,9 @@ static void remove_siblinginfo(int cpu)
for_each_cpu(sibling, cpu_sibling_mask(cpu))
cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+ for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
+ cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
+ cpumask_clear(cpu_llc_shared_mask(cpu));
cpumask_clear(cpu_sibling_mask(cpu));
cpumask_clear(cpu_core_mask(cpu));
c->phys_proc_id = 0;
@@ -1309,10 +1303,14 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
+static DEFINE_PER_CPU(struct completion, die_complete);
+
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
+ init_completion(&per_cpu(die_complete, smp_processor_id()));
+
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1331,26 +1329,29 @@ int native_cpu_disable(void)
return ret;
clear_local_APIC();
-
cpu_disable_common();
+
return 0;
}
+void cpu_die_common(unsigned int cpu)
+{
+	struct completion *done = &per_cpu(die_complete, cpu);
+
+	/*
+	 * Wait at most HZ jiffies for @cpu's die_complete completion;
+	 * the result of the wait is deliberately not returned.
+	 */
+	wait_for_completion_timeout(done, HZ);
+}
+
void native_cpu_die(unsigned int cpu)
{
/* We don't do anything here: idle task is faking death itself. */
- unsigned int i;
- for (i = 0; i < 10; i++) {
- /* They ack this in play_dead by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- if (system_state == SYSTEM_RUNNING)
- pr_info("CPU %u is now offline\n", cpu);
- return;
- }
- msleep(100);
+ cpu_die_common(cpu);
+
+ /* They ack this in play_dead() by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ if (system_state == SYSTEM_RUNNING)
+ pr_info("CPU %u is now offline\n", cpu);
+ } else {
+ pr_err("CPU %u didn't die...\n", cpu);
}
- pr_err("CPU %u didn't die...\n", cpu);
}
void play_dead_common(void)
@@ -1362,6 +1363,7 @@ void play_dead_common(void)
mb();
/* Ack it */
__this_cpu_write(cpu_state, CPU_DEAD);
+ complete(&per_cpu(die_complete, smp_processor_id()));
/*
* With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index bf7ef5ce29df..0fa29609b2c4 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -68,6 +68,8 @@ static struct irqaction irq0 = {
void __init setup_default_timer_irq(void)
{
+ if (!nr_legacy_irqs())
+ return;
setup_irq(0, &irq0);
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b6025f9e36c6..b7e50bba3bbb 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1166,14 +1166,17 @@ void __init tsc_init(void)
x86_init.timers.tsc_pre_init();
- if (!cpu_has_tsc)
+ if (!cpu_has_tsc) {
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
+ }
tsc_khz = x86_platform.calibrate_tsc();
cpu_khz = tsc_khz;
if (!tsc_khz) {
mark_tsc_unstable("could not calculate TSC khz");
+ setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
return;
}
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index b99b9ad8540c..ee22c1d93ae5 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -152,7 +152,7 @@ static void __init detect_vsmp_box(void)
is_vsmp = 1;
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
if (is_vsmp != -1)
return is_vsmp;
@@ -166,7 +166,7 @@ int is_vsmp_box(void)
static void __init detect_vsmp_box(void)
{
}
-int is_vsmp_box(void)
+static int is_vsmp_box(void)
{
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e1e1e80fc6a6..957779f4eb40 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
*/
regs->orig_ax = syscall_nr;
regs->ax = -ENOSYS;
- tmp = secure_computing(syscall_nr);
+ tmp = secure_computing();
if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
warn_bad_vsyscall(KERN_DEBUG, regs,
"seccomp tried to change syscall nr or ip");
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c6addf..4c540c4719d8 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -8,6 +8,7 @@
#include <linux/bootmem.h>
#include <linux/compat.h>
+#include <linux/cpu.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
@@ -24,7 +25,9 @@ u64 pcntxt_mask;
struct xsave_struct *init_xstate_buf;
static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+static unsigned int *xstate_offsets, *xstate_sizes;
+static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
+static unsigned int xstate_features;
/*
* If a processor implementation discern that a processor state component is
@@ -268,8 +271,6 @@ int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
return -1;
- drop_init_fpu(tsk); /* trigger finit */
-
return 0;
}
@@ -283,7 +284,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
if (use_xsave()) {
/* These bits must be zero. */
- xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+ memset(xsave_hdr->reserved, 0, 48);
/*
* Init the state that is not present in the memory
@@ -399,8 +400,11 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
set_used_math();
}
- if (use_eager_fpu())
+ if (use_eager_fpu()) {
+ preempt_disable();
math_state_restore();
+ preempt_enable();
+ }
return err;
} else {
@@ -479,6 +483,52 @@ static void __init setup_xstate_features(void)
}
/*
+ * This function sets up the offsets (xstate_comp_offsets[]) of all extended
+ * states in the xsave area. It supports both the standard format and the
+ * compacted format of the xsave area.
+ *
+ * Input: void
+ * Output: void
+ */
+void setup_xstate_comp(void)
+{
+ unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8]; /* local scratch: one slot per bit of pcntxt_mask */
+ int i;
+
+ /*
+ * The FP xstates and SSE xstates are legacy states. They are always
+ * in the fixed offsets in the xsave area in either compacted form
+ * or standard form.
+ */
+ xstate_comp_offsets[0] = 0;
+ xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
+
+ if (!cpu_has_xsaves) { /* no XSAVES: copy the existing per-feature offsets unchanged */
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ xstate_comp_offsets[i] = xstate_offsets[i];
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ }
+ }
+ return;
+ }
+
+ xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE; /* feature 2 starts right after the legacy area and the header */
+
+ for (i = 2; i < xstate_features; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask))
+ xstate_comp_sizes[i] = xstate_sizes[i];
+ else
+ xstate_comp_sizes[i] = 0; /* features not in pcntxt_mask occupy no space */
+
+ if (i > 2) /* offset = previous feature's offset + previous feature's size */
+ xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+ + xstate_comp_sizes[i-1];
+
+ }
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_init_fpu_buf(void)
@@ -496,15 +546,21 @@ static void __init setup_init_fpu_buf(void)
setup_xstate_features();
+ if (cpu_has_xsaves) {
+ init_xstate_buf->xsave_hdr.xcomp_bv =
+ (u64)1 << 63 | pcntxt_mask;
+ init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
+ }
+
/*
* Init all the features state with header_bv being 0x0
*/
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state_booting(init_xstate_buf, -1);
/*
* Dump the init state again. This is to identify the init state
* of any feature which is not represented by all zero's.
*/
- xsave_state(init_xstate_buf, -1);
+ xsave_state_booting(init_xstate_buf, -1);
}
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -520,6 +576,30 @@ static int __init eager_fpu_setup(char *s)
}
__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Calculate total size of enabled xstates in XCR0/pcntxt_mask and store it in xstate_size.
+ */
+static void __init init_xstate_size(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ int i;
+
+ if (!cpu_has_xsaves) {
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+ xstate_size = ebx; /* without XSAVES: take EBX of sub-leaf 0 as the size */
+ return;
+ }
+
+ xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; /* with XSAVES: start from the legacy area plus header */
+ for (i = 2; i < 64; i++) {
+ if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
+ cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+ xstate_size += eax; /* add EAX of sub-leaf i for every feature set in pcntxt_mask */
+ }
+ }
+}
+
/*
* Enable and initialize the xsave feature.
*/
@@ -551,8 +631,7 @@ static void __init xstate_enable_boot_cpu(void)
/*
* Recompute the context size for enabled features
*/
- cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
- xstate_size = ebx;
+ init_xstate_size();
update_regset_xstate_info(xstate_size, pcntxt_mask);
prepare_fx_sw_frame();
@@ -572,8 +651,9 @@ static void __init xstate_enable_boot_cpu(void)
}
}
- pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
- pcntxt_mask, xstate_size);
+ pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
+ pcntxt_mask, xstate_size,
+ cpu_has_xsaves ? "compacted form" : "standard form");
}
/*
@@ -635,3 +715,26 @@ void eager_fpu_init(void)
else
fxrstor_checking(&init_xstate_buf->i387);
}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Inputs:
+ * xsave: base address of the xsave area;
+ * xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
+ * etc.)
+ * Output:
+ * address of the state in the xsave area, or NULL if the state's bit is not set in pcntxt_mask.
+ */
+void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
+{
+ int feature = fls64(xstate) - 1; /* feature number derived from the highest set bit of xstate */
+ if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
+ return NULL;
+
+ return (void *)xsave + xstate_comp_offsets[feature]; /* base plus the offset recorded in xstate_comp_offsets[] */
+}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 287e4c85fff9..f9d16ff56c6b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,6 +27,7 @@ config KVM
select MMU_NOTIFIER
select ANON_INODES
select HAVE_KVM_IRQCHIP
+ select HAVE_KVM_IRQFD
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 38a0afe83c6b..976e3a57f9ea 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -53,14 +53,14 @@ u64 kvm_supported_xcr0(void)
return xcr0;
}
-void kvm_update_cpuid(struct kvm_vcpu *vcpu)
+int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
struct kvm_lapic *apic = vcpu->arch.apic;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
if (!best)
- return;
+ return 0;
/* Update OSXSAVE bit */
if (cpu_has_xsave && best->function == 0x1) {
@@ -88,7 +88,17 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
xstate_required_size(vcpu->arch.xcr0);
}
+ /*
+ * The existing code assumes virtual address is 48-bit in the canonical
+ * address checks; exit if it is ever changed.
+ */
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best && ((best->eax & 0xff00) >> 8) != 48 &&
+ ((best->eax & 0xff00) >> 8) != 0)
+ return -EINVAL;
+
kvm_pmu_cpuid_update(vcpu);
+ return 0;
}
static int is_efer_nx(void)
@@ -112,8 +122,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
break;
}
}
- if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
- entry->edx &= ~(1 << 20);
+ if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) {
+ entry->edx &= ~bit(X86_FEATURE_NX);
printk(KERN_INFO "kvm: guest NX capability removed\n");
}
}
@@ -151,10 +161,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
}
vcpu->arch.cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
- r = 0;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- kvm_update_cpuid(vcpu);
+ r = kvm_update_cpuid(vcpu);
out_free:
vfree(cpuid_entries);
@@ -178,9 +187,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
kvm_x86_ops->cpuid_update(vcpu);
- kvm_update_cpuid(vcpu);
- return 0;
-
+ r = kvm_update_cpuid(vcpu);
out:
return r;
}
@@ -767,6 +774,12 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
if (!best)
best = check_cpuid_limit(vcpu, function, index);
+ /*
+ * Perfmon not yet supported for L2 guest.
+ */
+ if (is_guest_mode(vcpu) && function == 0xa)
+ best = NULL;
+
if (best) {
*eax = best->eax;
*ebx = best->ebx;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index a5380590ab0e..4452eedfaedd 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -3,7 +3,7 @@
#include "x86.h"
-void kvm_update_cpuid(struct kvm_vcpu *vcpu);
+int kvm_update_cpuid(struct kvm_vcpu *vcpu);
struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
u32 function, u32 index);
int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
@@ -88,6 +88,14 @@ static inline bool guest_cpuid_has_x2apic(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_X2APIC));
}
+static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0, 0); /* look up function 0, index 0 of the guest's CPUID entries */
+ return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; /* false when the entry is missing; only EBX is compared */
+}
+
static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 56657b0bb3bb..9f8a2faf5040 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -504,11 +504,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc);
}
-static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- register_address_increment(ctxt, &ctxt->_eip, rel);
-}
-
static u32 desc_limit_scaled(struct desc_struct *desc)
{
u32 limit = get_desc_limit(desc);
@@ -527,6 +522,7 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
u32 error, bool valid)
{
+ WARN_ON(vec > 0x1f);
ctxt->exception.vector = vec;
ctxt->exception.error_code = error;
ctxt->exception.error_code_valid = valid;
@@ -568,6 +564,40 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, NM_VECTOR, 0, false);
}
+static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ int cs_l)
+{
+ switch (ctxt->op_bytes) { /* the operand size decides how much of dst is stored in _eip */
+ case 2:
+ ctxt->_eip = (u16)dst;
+ break;
+ case 4:
+ ctxt->_eip = (u32)dst;
+ break;
+#ifdef CONFIG_X86_64
+ case 8:
+ if ((cs_l && is_noncanonical_address(dst)) ||
+ (!cs_l && (dst >> 32) != 0)) /* cs_l set: dst must be canonical; cs_l clear: the upper 32 bits must be zero */
+ return emulate_gp(ctxt, 0);
+ ctxt->_eip = dst;
+ break;
+#endif
+ default:
+ WARN(1, "unsupported eip assignment size\n"); /* _eip is left unchanged, yet X86EMUL_CONTINUE is still returned below */
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); /* cs_l argument is 1 only when the emulation mode is PROT64 */
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel); /* target is the current _eip plus rel; the assignment result is returned to the caller */
+}
+
static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
{
u16 selector;
@@ -613,7 +643,8 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
static int __linearize(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
- unsigned size, bool write, bool fetch,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
ulong *linear)
{
struct desc_struct desc;
@@ -624,10 +655,15 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
unsigned cpl;
la = seg_base(ctxt, addr.seg) + addr.ea;
+ *max_size = 0;
switch (ctxt->mode) {
case X86EMUL_MODE_PROT64:
if (((signed long)la << 16) >> 16 != la)
return emulate_gp(ctxt, 0);
+
+ *max_size = min_t(u64, ~0u, (1ull << 48) - la);
+ if (size > *max_size)
+ goto bad;
break;
default:
usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
@@ -645,20 +681,25 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
(ctxt->d & NoBigReal)) {
/* la is between zero and 0xffff */
- if (la > 0xffff || (u32)(la + size - 1) > 0xffff)
+ if (la > 0xffff)
goto bad;
+ *max_size = 0x10000 - la;
} else if ((desc.type & 8) || !(desc.type & 4)) {
/* expand-up segment */
- if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ if (addr.ea > lim)
goto bad;
+ *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
} else {
/* expand-down segment */
- if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
+ if (addr.ea <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
- if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
+ if (addr.ea > lim)
goto bad;
+ *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
}
+ if (size > *max_size)
+ goto bad;
cpl = ctxt->ops->cpl(ctxt);
if (!(desc.type & 8)) {
/* data segment */
@@ -683,9 +724,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
bad:
if (addr.seg == VCPU_SREG_SS)
- return emulate_ss(ctxt, sel);
+ return emulate_ss(ctxt, 0);
else
- return emulate_gp(ctxt, sel);
+ return emulate_gp(ctxt, 0);
}
static int linearize(struct x86_emulate_ctxt *ctxt,
@@ -693,7 +734,8 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
unsigned size, bool write,
ulong *linear)
{
- return __linearize(ctxt, addr, size, write, false, linear);
+ unsigned max_size;
+ return __linearize(ctxt, addr, &max_size, size, write, false, linear);
}
@@ -718,17 +760,27 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
{
int rc;
- unsigned size;
+ unsigned size, max_size;
unsigned long linear;
int cur_size = ctxt->fetch.end - ctxt->fetch.data;
struct segmented_address addr = { .seg = VCPU_SREG_CS,
.ea = ctxt->eip + cur_size };
- size = 15UL ^ cur_size;
- rc = __linearize(ctxt, addr, size, false, true, &linear);
+ /*
+ * We do not know exactly how many bytes will be needed, and
+ * __linearize is expensive, so fetch as much as possible. We
+ * just have to avoid going beyond the 15 byte limit, the end
+ * of the segment, or the end of the page.
+ *
+ * __linearize is called with size 0 so that it does not do any
+ * boundary check itself. Instead, we use max_size to check
+ * against op_size.
+ */
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear);
if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
+ size = min_t(unsigned, 15UL ^ cur_size, max_size);
size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear));
/*
@@ -738,7 +790,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
* still, we must have hit the 15-byte boundary.
*/
if (unlikely(size < op_size))
- return X86EMUL_UNHANDLEABLE;
+ return emulate_gp(ctxt, 0);
+
rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end,
size, &ctxt->exception);
if (unlikely(rc != X86EMUL_CONTINUE))
@@ -750,8 +803,10 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt,
unsigned size)
{
- if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size))
- return __do_insn_fetch_bytes(ctxt, size);
+ unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; /* bytes between fetch.ptr and fetch.end, i.e. already in the fetch buffer */
+
+ if (unlikely(done_size < size))
+ return __do_insn_fetch_bytes(ctxt, size - done_size); /* ask only for the bytes that are still missing */
else
return X86EMUL_CONTINUE;
}
@@ -1415,7 +1470,9 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/* Does not support long mode */
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, int seg, u8 cpl, bool in_task_switch)
+ u16 selector, int seg, u8 cpl,
+ bool in_task_switch,
+ struct desc_struct *desc)
{
struct desc_struct seg_desc, old_desc;
u8 dpl, rpl;
@@ -1468,7 +1525,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
err_code = selector & 0xfffc;
- err_vec = GP_VECTOR;
+ err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
/* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
@@ -1491,9 +1548,6 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
goto exception;
break;
case VCPU_SREG_CS:
- if (in_task_switch && rpl != dpl)
- goto exception;
-
if (!(seg_desc.type & 8))
goto exception;
@@ -1506,6 +1560,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (rpl > cpl || dpl != cpl)
goto exception;
}
+ /* in long-mode d/b must be clear if l is set */
+ if (seg_desc.d && seg_desc.l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ goto exception;
+ }
+
/* CS(RPL) <- CPL */
selector = (selector & 0xfffc) | cpl;
break;
@@ -1550,17 +1613,18 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
+ if (desc)
+ *desc = seg_desc;
return X86EMUL_CONTINUE;
exception:
- emulate_exception(ctxt, err_vec, err_code, true);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_exception(ctxt, err_vec, err_code, true);
}
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
u8 cpl = ctxt->ops->cpl(ctxt);
- return __load_segment_descriptor(ctxt, selector, seg, cpl, false);
+ return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); /* NULL: this wrapper does not need the loaded descriptor copied back */
}
static void write_register_operand(struct operand *op)
@@ -1954,17 +2018,31 @@ static int em_iret(struct x86_emulate_ctxt *ctxt)
static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned short sel;
+ unsigned short sel, old_sel;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ u8 cpl = ctxt->ops->cpl(ctxt);
+
+ /* Assignment of RIP may only fail in 64-bit mode, so the old CS is saved only there */
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ ops->get_segment(ctxt, &old_sel, &old_desc, NULL,
+ VCPU_SREG_CS);
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+ &new_desc); /* new_desc.l feeds the RIP check in assign_eip_far() below */
if (rc != X86EMUL_CONTINUE)
return rc;
- ctxt->_eip = 0;
- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
- return X86EMUL_CONTINUE;
+ rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ if (rc != X86EMUL_CONTINUE) {
+ WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
+ /* assigning eip failed; restore the old cs */
+ ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS);
+ return rc;
+ }
+ return rc;
}
static int em_grp45(struct x86_emulate_ctxt *ctxt)
@@ -1975,13 +2053,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
case 2: /* call near abs */ {
long int old_eip;
old_eip = ctxt->_eip;
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ break;
ctxt->src.val = old_eip;
rc = em_push(ctxt);
break;
}
case 4: /* jmp abs */
- ctxt->_eip = ctxt->src.val;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
break;
case 5: /* jmp far */
rc = em_jmp_far(ctxt);
@@ -2016,30 +2096,47 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
static int em_ret(struct x86_emulate_ctxt *ctxt)
{
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- return em_pop(ctxt);
+ int rc;
+ unsigned long eip;
+
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); /* pop into a local; _eip is written only by assign_eip_near() */
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip_near(ctxt, eip); /* may return a fault instead of installing the popped value */
}
static int em_ret_far(struct x86_emulate_ctxt *ctxt)
{
int rc;
- unsigned long cs;
+ unsigned long eip, cs;
+ u16 old_cs;
int cpl = ctxt->ops->cpl(ctxt);
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64) /* old CS is only needed for the restore path below */
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL,
+ VCPU_SREG_CS);
- rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->op_bytes == 4)
- ctxt->_eip = (u32)ctxt->_eip;
rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+ &new_desc); /* pass the real CPL, as the replaced load_segment_descriptor() call did, not ring 0 */
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_far(ctxt, eip, new_desc.l);
+ if (rc != X86EMUL_CONTINUE) {
+ WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); /* RIP assignment failed: put the previous CS back */
+ }
return rc;
}
@@ -2300,7 +2397,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
{
const struct x86_emulate_ops *ops = ctxt->ops;
struct desc_struct cs, ss;
- u64 msr_data;
+ u64 msr_data, rcx, rdx;
int usermode;
u16 cs_sel = 0, ss_sel = 0;
@@ -2316,6 +2413,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
else
usermode = X86EMUL_MODE_PROT32;
+ rcx = reg_read(ctxt, VCPU_REGS_RCX);
+ rdx = reg_read(ctxt, VCPU_REGS_RDX);
+
cs.dpl = 3;
ss.dpl = 3;
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
@@ -2333,6 +2433,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
+ if (is_noncanonical_address(rcx) ||
+ is_noncanonical_address(rdx))
+ return emulate_gp(ctxt, 0);
break;
}
cs_sel |= SELECTOR_RPL_MASK;
@@ -2341,8 +2444,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
- ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX);
- *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX);
+ ctxt->_eip = rdx;
+ *reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
}
@@ -2460,19 +2563,24 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
* Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
- ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2597,25 +2705,32 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* Now load segment descriptors. If fault happenes at this stage
* it is handled in a context of new task
*/
- ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
+ cpl, true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
- ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true);
+ ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
+ true, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2726,8 +2841,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
desc_limit < 0x2b)) {
- emulate_ts(ctxt, tss_selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_ts(ctxt, tss_selector & 0xfffc);
}
if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
@@ -2883,10 +2997,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt)
static int em_call(struct x86_emulate_ctxt *ctxt)
{
+ int rc;
long rel = ctxt->src.val;
ctxt->src.val = (unsigned long)ctxt->_eip;
- jmp_rel(ctxt, rel);
+ rc = jmp_rel(ctxt, rel); /* src.val already holds the old _eip for the em_push() below */
+ if (rc != X86EMUL_CONTINUE)
+ return rc; /* target assignment failed: return before pushing anything */
return em_push(ctxt);
}
@@ -2895,34 +3012,50 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
u16 sel, old_cs;
ulong old_eip;
int rc;
+ struct desc_struct old_desc, new_desc;
+ const struct x86_emulate_ops *ops = ctxt->ops;
+ int cpl = ctxt->ops->cpl(ctxt);
- old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
old_eip = ctxt->_eip;
+ ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); /* keep selector and descriptor so the fail path can restore CS */
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+ &new_desc);
+ if (rc != X86EMUL_CONTINUE)
return rc; /* propagate the descriptor-load failure instead of reporting success */
- ctxt->_eip = 0;
- memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
+ rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
ctxt->src.val = old_cs;
rc = em_push(ctxt);
if (rc != X86EMUL_CONTINUE)
- return rc;
+ goto fail;
ctxt->src.val = old_eip;
- return em_push(ctxt);
+ rc = em_push(ctxt);
+ /* If we failed, we tainted the memory, but the very least we should
+ restore cs */
+ if (rc != X86EMUL_CONTINUE)
+ goto fail;
+ return rc;
+fail:
+ ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ return rc;
+
}
static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ unsigned long eip;
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = &ctxt->_eip;
- ctxt->dst.bytes = ctxt->op_bytes;
- rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ rc = assign_eip_near(ctxt, eip);
if (rc != X86EMUL_CONTINUE)
return rc;
rsp_increment(ctxt, ctxt->src.val);
@@ -3019,7 +3152,7 @@ static int em_movbe(struct x86_emulate_ctxt *ctxt)
ctxt->dst.val = swab64(ctxt->src.val);
break;
default:
- return X86EMUL_PROPAGATE_FAULT;
+ BUG();
}
return X86EMUL_CONTINUE;
}
@@ -3143,12 +3276,8 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
static int em_vmcall(struct x86_emulate_ctxt *ctxt)
{
- int rc;
+ int rc = ctxt->ops->fix_hypercall(ctxt);
- if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
- return X86EMUL_UNHANDLEABLE;
-
- rc = ctxt->ops->fix_hypercall(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -3253,20 +3382,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt)
static int em_loop(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE; /* result when the branch is not taken */
+
register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val); /* branch taken: a failed target assignment is now reported */
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_jcxz(struct x86_emulate_ctxt *ctxt)
{
+ int rc = X86EMUL_CONTINUE; /* result when the masked RCX value is non-zero and no jump happens */
+
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0)
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val); /* jump taken: a failed target assignment is now reported */
- return X86EMUL_CONTINUE;
+ return rc;
}
static int em_in(struct x86_emulate_ctxt *ctxt)
@@ -3354,6 +3487,12 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_clflush(struct x86_emulate_ctxt *ctxt)
+{
+ /* clflush is emulated as a no-op here, regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -3566,6 +3705,12 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
+static const struct opcode group7_rm0[] = {
+ N,
+ I(SrcNone | Priv | EmulateOnUD, em_vmcall),
+ N, N, N, N, N, N,
+};
+
static const struct opcode group7_rm1[] = {
DI(SrcNone | Priv, monitor),
DI(SrcNone | Priv, mwait),
@@ -3659,7 +3804,7 @@ static const struct group_dual group7 = { {
II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
}, {
- I(SrcNone | Priv | EmulateOnUD, em_vmcall),
+ EXT(0, group7_rm0),
EXT(0, group7_rm1),
N, EXT(0, group7_rm3),
II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3686,18 +3831,32 @@ static const struct opcode group11[] = {
X7(D(Undefined)),
};
+static const struct gprefix pfx_0f_ae_7 = {
+ I(SrcMem | ByteOp, em_clflush), N, N, N,
+};
+
+static const struct group_dual group15 = { {
+ N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+}, {
+ N, N, N, N, N, N, N, N,
+} };
+
static const struct gprefix pfx_0f_6f_0f_7f = {
I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
};
-static const struct gprefix pfx_vmovntpx = {
- I(0, em_mov), N, N, N,
+static const struct gprefix pfx_0f_2b = {
+ I(0, em_mov), I(0, em_mov), N, N,
};
static const struct gprefix pfx_0f_28_0f_29 = {
I(Aligned, em_mov), I(Aligned, em_mov), N, N,
};
+static const struct gprefix pfx_0f_e7 = {
+ N, I(Sse, em_mov), N, N,
+};
+
static const struct escape escape_d9 = { {
N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
}, {
@@ -3890,10 +4049,11 @@ static const struct opcode twobyte_table[256] = {
N, I(ImplicitOps | EmulateOnUD, em_syscall),
II(ImplicitOps | Priv, em_clts, clts), N,
DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
- N, D(ImplicitOps | ModRM), N, N,
+ N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N,
/* 0x10 - 0x1F */
N, N, N, N, N, N, N, N,
- D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM),
+ D(ImplicitOps | ModRM | SrcMem | NoAccess),
+ N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess),
/* 0x20 - 0x2F */
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read),
DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read),
@@ -3904,7 +4064,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
GP(ModRM | DstReg | SrcMem | Mov | Sse, &pfx_0f_28_0f_29),
GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_28_0f_29),
- N, GP(ModRM | DstMem | SrcReg | Sse | Mov | Aligned, &pfx_vmovntpx),
+ N, GP(ModRM | DstMem | SrcReg | Mov | Sse, &pfx_0f_2b),
N, N, N, N,
/* 0x30 - 0x3F */
II(ImplicitOps | Priv, em_wrmsr, wrmsr),
@@ -3945,7 +4105,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
- D(ModRM), F(DstReg | SrcMem | ModRM, em_imul),
+ GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
@@ -3968,7 +4128,8 @@ static const struct opcode twobyte_table[256] = {
/* 0xD0 - 0xDF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0xE0 - 0xEF */
- N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
+ N, N, N, N, N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_e7),
+ N, N, N, N, N, N, N, N,
/* 0xF0 - 0xFF */
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
};
@@ -4126,6 +4287,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
fetch_register_operand(op);
break;
case OpCL:
+ op->type = OP_IMM;
op->bytes = 1;
op->val = reg_read(ctxt, VCPU_REGS_RCX) & 0xff;
break;
@@ -4133,6 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
rc = decode_imm(ctxt, op, 1, true);
break;
case OpOne:
+ op->type = OP_IMM;
op->bytes = 1;
op->val = 1;
break;
@@ -4191,21 +4354,27 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
ctxt->memop.bytes = ctxt->op_bytes + 2;
goto mem_common;
case OpES:
+ op->type = OP_IMM;
op->val = VCPU_SREG_ES;
break;
case OpCS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_CS;
break;
case OpSS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_SS;
break;
case OpDS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_DS;
break;
case OpFS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_FS;
break;
case OpGS:
+ op->type = OP_IMM;
op->val = VCPU_SREG_GS;
break;
case OpImplicit:
@@ -4394,8 +4563,11 @@ done_prefixes:
ctxt->execute = opcode.u.execute;
+ if (unlikely(ctxt->ud) && likely(!(ctxt->d & EmulateOnUD)))
+ return EMULATION_FAILED;
+
if (unlikely(ctxt->d &
- (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
/*
* These are copied unconditionally here, and checked unconditionally
* in x86_emulate_insn.
@@ -4406,9 +4578,6 @@ done_prefixes:
if (ctxt->d & NotImpl)
return EMULATION_FAILED;
- if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
- return EMULATION_FAILED;
-
if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
ctxt->op_bytes = 8;
@@ -4461,10 +4630,10 @@ done_prefixes:
/* Decode and fetch the destination operand: register or memory. */
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
-done:
if (ctxt->rip_relative)
ctxt->memopp->addr.mem.ea += ctxt->_eip;
+done:
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
@@ -4714,7 +4883,7 @@ special_insn:
break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x8d: /* lea r16/r32, m */
ctxt->dst.val = ctxt->src.addr.mem.ea;
@@ -4744,7 +4913,7 @@ special_insn:
break;
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -4832,8 +5001,10 @@ writeback:
ctxt->eip = ctxt->_eip;
done:
- if (rc == X86EMUL_PROPAGATE_FAULT)
+ if (rc == X86EMUL_PROPAGATE_FAULT) {
+ WARN_ON(ctxt->exception.vector > 0x1f);
ctxt->have_exception = true;
+ }
if (rc == X86EMUL_INTERCEPTED)
return EMULATION_INTERCEPTED;
@@ -4867,13 +5038,11 @@ twobyte_insn:
break;
case 0x80 ... 0x8f: /* jnz rel, etc*/
if (test_cc(ctxt->b, ctxt->eflags))
- jmp_rel(ctxt, ctxt->src.val);
+ rc = jmp_rel(ctxt, ctxt->src.val);
break;
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
- case 0xae: /* clflush */
- break;
case 0xb6 ... 0xb7: /* movzx */
ctxt->dst.bytes = ctxt->op_bytes;
ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 518d86471b76..298781d4cfb4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
return;
timer = &pit->pit_state.timer;
+ mutex_lock(&pit->pit_state.lock);
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
}
static void destroy_pit_timer(struct kvm_pit *pit)
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index bd0da433e6d7..a1ec6a50a05a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
vector = kvm_cpu_get_extint(v);
- if (kvm_apic_vid_enabled(v->kvm) || vector != -1)
+ if (vector != -1)
return vector; /* PIC */
return kvm_get_apic_interrupt(v); /* APIC */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3855103f71fd..b8345dd41b25 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -112,17 +112,6 @@ static inline int __apic_test_and_clear_vector(int vec, void *bitmap)
struct static_key_deferred apic_hw_disabled __read_mostly;
struct static_key_deferred apic_sw_disabled __read_mostly;
-static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
-{
- if ((kvm_apic_get_reg(apic, APIC_SPIV) ^ val) & APIC_SPIV_APIC_ENABLED) {
- if (val & APIC_SPIV_APIC_ENABLED)
- static_key_slow_dec_deferred(&apic_sw_disabled);
- else
- static_key_slow_inc(&apic_sw_disabled.key);
- }
- apic_set_reg(apic, APIC_SPIV, val);
-}
-
static inline int apic_enabled(struct kvm_lapic *apic)
{
return kvm_apic_sw_enabled(apic) && kvm_apic_hw_enabled(apic);
@@ -210,6 +199,20 @@ out:
kvm_vcpu_request_scan_ioapic(kvm);
}
+static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
+{
+ u32 prev = kvm_apic_get_reg(apic, APIC_SPIV);
+
+ apic_set_reg(apic, APIC_SPIV, val);
+ if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) {
+ if (val & APIC_SPIV_APIC_ENABLED) {
+ static_key_slow_dec_deferred(&apic_sw_disabled);
+ recalculate_apic_map(apic->vcpu->kvm);
+ } else
+ static_key_slow_inc(&apic_sw_disabled.key);
+ }
+}
+
static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
{
apic_set_reg(apic, APIC_ID, id << 24);
@@ -352,25 +355,46 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = false;
+ struct kvm_vcpu *vcpu;
+
+ vcpu = apic->vcpu;
+
apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (apic_search_irr(apic) != -1)
- apic->irr_pending = true;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ /* try to update RVI */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ else {
+ vec = apic_search_irr(apic);
+ apic->irr_pending = (vec != -1);
+ }
}
static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
{
- /* Note that we never get here with APIC virtualization enabled. */
+ struct kvm_vcpu *vcpu;
+
+ if (__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
+ return;
+
+ vcpu = apic->vcpu;
- if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR))
- ++apic->isr_count;
- BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
/*
- * ISR (in service register) bit is set when injecting an interrupt.
- * The highest vector is injected. Thus the latest bit set matches
- * the highest bit in ISR.
+ * With APIC virtualization enabled, all caching is disabled
+ * because the processor can modify ISR under the hood. Instead
+ * just set SVI.
*/
- apic->highest_isr_cache = vec;
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
+ else {
+ ++apic->isr_count;
+ BUG_ON(apic->isr_count > MAX_APIC_VECTOR);
+ /*
+ * ISR (in service register) bit is set when injecting an interrupt.
+ * The highest vector is injected. Thus the latest bit set matches
+ * the highest bit in ISR.
+ */
+ apic->highest_isr_cache = vec;
+ }
}
static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -685,6 +709,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int result = 0;
struct kvm_vcpu *vcpu = apic->vcpu;
+ trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+ trig_mode, vector);
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
@@ -706,8 +732,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
- trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
- trig_mode, vector, false);
break;
case APIC_DM_REMRD:
@@ -1331,6 +1355,9 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
return;
hrtimer_cancel(&apic->lapic_timer.timer);
+ /* Inject here so clearing tscdeadline won't override new value */
+ if (apic_has_pending_timer(vcpu))
+ kvm_inject_apic_timer_irqs(vcpu);
apic->lapic_timer.tscdeadline = data;
start_apic_timer(apic);
}
@@ -1618,6 +1645,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
if (atomic_read(&apic->lapic_timer.pending) > 0) {
kvm_apic_local_deliver(apic, APIC_LVTT);
+ if (apic_lvtt_tscdeadline(apic))
+ apic->lapic_timer.tscdeadline = 0;
atomic_set(&apic->lapic_timer.pending, 0);
}
}
@@ -1627,11 +1656,16 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
- /* Note that we never get here with APIC virtualization enabled. */
-
if (vector == -1)
return -1;
+ /*
+ * We get here even with APIC virtualization enabled, if doing
+ * nested virtualization and L1 runs with the "acknowledge interrupt
+ * on exit" mode. Then we cannot inject the interrupt via RVI,
+ * because the processor would deliver it through the IDT.
+ */
+
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 931467881da7..ac1c4de3a484 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
/*
- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
- * number.
+ * the low bit of the generation number is always presumed to be zero.
+ * This disables mmio caching during memslot updates. The concept is
+ * similar to a seqcount but instead of retrying the access we just punt
+ * and ignore the cache.
+ *
+ * spte bits 3-11 are used as bits 1-9 of the generation number,
+ * the bits 52-61 are used as bits 10-19 of the generation number.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 3
+#define MMIO_SPTE_GEN_LOW_SHIFT 2
#define MMIO_SPTE_GEN_HIGH_SHIFT 52
-#define MMIO_GEN_SHIFT 19
-#define MMIO_GEN_LOW_SHIFT 9
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_SHIFT 20
+#define MMIO_GEN_LOW_SHIFT 10
+#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
@@ -236,12 +240,7 @@ static unsigned int get_mmio_spte_generation(u64 spte)
static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
{
- /*
- * Init kvm generation close to MMIO_MAX_GEN to easily test the
- * code of handling generation number wrap-around.
- */
- return (kvm_memslots(kvm)->generation +
- MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+ return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
}
static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
@@ -296,11 +295,6 @@ static bool check_mmio_spte(struct kvm *kvm, u64 spte)
return likely(kvm_gen == spte_gen);
}
-static inline u64 rsvd_bits(int s, int e)
-{
- return ((1ULL << (e - s + 1)) - 1) << s;
-}
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -1180,7 +1174,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
* Write-protect on the specified @sptep, @pt_protect indicates whether
* spte write-protection is caused by protecting shadow page table.
*
- * Note: write protection is difference between drity logging and spte
+ * Note: write protection is different between dirty logging and spte
* protection:
* - for dirty logging, the spte can be set to writable at anytime if
* its dirty bitmap is properly set.
@@ -1268,7 +1262,8 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1276,7 +1271,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
drop_spte(kvm, sptep);
need_tlb_flush = 1;
@@ -1286,7 +1282,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1300,7 +1297,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!is_shadow_present_pte(*sptep));
- rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+ rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
+ sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1334,6 +1332,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
int (*handler)(struct kvm *kvm,
unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn,
+ int level,
unsigned long data))
{
int j;
@@ -1363,6 +1363,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
unsigned long idx, idx_end;
unsigned long *rmapp;
+ gfn_t gfn = gfn_start;
/*
* {idx(page_j) | page_j intersects with
@@ -1373,8 +1374,10 @@ static int kvm_handle_hva_range(struct kvm *kvm,
rmapp = __gfn_to_rmap(gfn_start, j, memslot);
- for (; idx <= idx_end; ++idx)
- ret |= handler(kvm, rmapp++, memslot, data);
+ for (; idx <= idx_end;
+ ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
+ ret |= handler(kvm, rmapp++, memslot,
+ gfn, j, data);
}
}
@@ -1385,6 +1388,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
struct kvm_memory_slot *slot,
+ gfn_t gfn, int level,
unsigned long data))
{
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
@@ -1406,24 +1410,14 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
int young = 0;
- /*
- * In case of absence of EPT Access and Dirty Bits supports,
- * emulate the accessed bit for EPT, by checking if this page has
- * an EPT mapping, and clearing it if it does. On the next access,
- * a new EPT mapping will be established.
- * This has some overhead, but not as much as the cost of swapping
- * out actively used pages or breaking up actively used hugepages.
- */
- if (!shadow_accessed_mask) {
- young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
- goto out;
- }
+ BUG_ON(!shadow_accessed_mask);
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
@@ -1435,14 +1429,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
(unsigned long *)sptep);
}
}
-out:
- /* @data has hva passed to kvm_age_hva(). */
- trace_kvm_age_page(data, slot, young);
+ trace_kvm_age_page(gfn, level, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, unsigned long data)
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ int level, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1480,13 +1473,33 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
-int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
{
- return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
+ /*
+ * In case of absence of EPT Access and Dirty Bits supports,
+ * emulate the accessed bit for EPT, by checking if this page has
+ * an EPT mapping, and clearing it if it does. On the next access,
+ * a new EPT mapping will be established.
+ * This has some overhead, but not as much as the cost of swapping
+ * out actively used pages or breaking up actively used hugepages.
+ */
+ if (!shadow_accessed_mask) {
+ /*
+ * We are holding the kvm->mmu_lock, and we are blowing up
+ * shadow PTEs. MMU notifier consumers need to be kept at bay.
+ * This is correct as long as we don't decouple the mmu_lock
+ * protected regions (like invalidate_range_start|end does).
+ */
+ kvm->mmu_notifier_seq++;
+ return kvm_handle_hva_range(kvm, start, end, 0,
+ kvm_unmap_rmapp);
+ }
+
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
@@ -1749,7 +1762,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return 1;
}
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
return 0;
}
@@ -1802,7 +1815,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
if (flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
struct mmu_page_path {
@@ -2536,7 +2549,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
true, host_writable)) {
if (write_fault)
*emulate = 1;
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep) && emulate))
@@ -3163,7 +3176,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
- vcpu_clear_mmio_info(vcpu, ~0ul);
+ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -3206,7 +3219,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
{
if (exception)
exception->error_code = 0;
- return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
+ return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
}
static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -3450,13 +3463,6 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
context->nx = false;
}
-void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- ++vcpu->stat.tlb_flush;
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
-
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -3518,6 +3524,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
+ u64 nonleaf_bit8_rsvd = 0;
context->bad_mt_xwr = 0;
@@ -3525,6 +3532,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
exb_bit_rsvd = rsvd_bits(63, 63);
if (!guest_cpuid_has_gbpages(vcpu))
gbpages_bit_rsvd = rsvd_bits(7, 7);
+
+ /*
+ * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
+ * leaf entries) on AMD CPUs only.
+ */
+ if (guest_cpuid_is_amd(vcpu))
+ nonleaf_bit8_rsvd = rsvd_bits(8, 8);
+
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
@@ -3559,9 +3574,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
break;
case PT64_ROOT_LEVEL:
context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
- rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 7);
+ nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
+ nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
@@ -3962,7 +3977,7 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
if (remote_flush)
kvm_flush_remote_tlbs(vcpu->kvm);
else if (local_flush)
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
@@ -4223,7 +4238,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
vcpu->arch.mmu.invlpg(vcpu, gva);
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -4433,7 +4448,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
+ if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
@@ -4534,7 +4549,7 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
- if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
+ if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
goto nomem;
register_shrinker(&mmu_shrinker);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index b982112d2ca5..bde8ee725754 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -56,6 +56,11 @@
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+static inline u64 rsvd_bits(int s, int e)
+{
+ return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 1185fe7a7f47..9ade5cfb5a4c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -273,7 +273,7 @@ static int mmu_audit_set(const char *val, const struct kernel_param *kp)
int ret;
unsigned long enable;
- ret = strict_strtoul(val, 10, &enable);
+ ret = kstrtoul(val, 10, &enable);
if (ret < 0)
return -EINVAL;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 410776528265..fd49c867b25a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -298,8 +298,7 @@ retry_walk:
}
#endif
walker->max_level = walker->level;
- ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
- (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
+ ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
accessed_dirty = PT_GUEST_ACCESSED_MASK;
pt_access = pte_access = ACC_ALL;
@@ -321,9 +320,22 @@ retry_walk:
walker->pte_gpa[walker->level - 1] = pte_gpa;
real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
- PFERR_USER_MASK|PFERR_WRITE_MASK);
+ PFERR_USER_MASK|PFERR_WRITE_MASK,
+ &walker->fault);
+
+ /*
+ * FIXME: This can happen if emulation (e.g. of an INS/OUTS
+ * instruction) triggers a nested page fault. The exit
+ * qualification / exit info field will incorrectly have
+ * "guest page access" as the nested page fault's cause,
+ * instead of "guest page structure access". To fix this,
+ * the x86_exception struct should be augmented with enough
+ * information to fix the exit_qualification or exit_info_1
+ * fields.
+ */
if (unlikely(real_gfn == UNMAPPED_GVA))
- goto error;
+ return 0;
+
real_gfn = gpa_to_gfn(real_gfn);
host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
@@ -364,7 +376,7 @@ retry_walk:
if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
gfn += pse36_gfn_delta(pte);
- real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access);
+ real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
if (real_gpa == UNMAPPED_GVA)
return 0;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 3dd6accb64ec..8e6b7d869d2f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -15,6 +15,7 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
@@ -463,7 +464,8 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = &vcpu->arch.pmu;
struct kvm_cpuid_entry2 *entry;
- unsigned bitmap_len;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -475,25 +477,27 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
- pmu->version = entry->eax & 0xff;
+ pmu->version = eax.split.version_id;
if (!pmu->version)
return;
- pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
- INTEL_PMC_MAX_GENERIC);
- pmu->counter_bitmask[KVM_PMC_GP] =
- ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
- bitmap_len = (entry->eax >> 24) & 0xff;
- pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ INTEL_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
if (pmu->version == 1) {
pmu->nr_arch_fixed_counters = 0;
} else {
- pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
+ pmu->nr_arch_fixed_counters =
+ min_t(int, edx.split.num_counters_fixed,
INTEL_PMC_MAX_FIXED);
pmu->counter_bitmask[KVM_PMC_FIXED] =
- ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
}
pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ddf742768ecf..7527cefc5a43 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -622,7 +622,7 @@ static int has_svm(void)
return 1;
}
-static void svm_hardware_disable(void *garbage)
+static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
@@ -633,7 +633,7 @@ static void svm_hardware_disable(void *garbage)
amd_pmu_disable_virt();
}
-static int svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void)
{
struct svm_cpu_data *sd;
@@ -670,7 +670,7 @@ static int svm_hardware_enable(void *garbage)
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
- __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
+ __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
}
@@ -1257,7 +1257,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->asid_generation = 0;
init_vmcb(svm);
- svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
@@ -1312,8 +1313,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
- svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
- __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
+ svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
+ __this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
}
}
@@ -1974,10 +1975,26 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
+ if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ /*
+ * TODO: track the cause of the nested page fault, and
+ * correctly fill in the high bits of exit_info_1.
+ */
+ svm->vmcb->control.exit_code = SVM_EXIT_NPF;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = (1ULL << 32);
+ svm->vmcb->control.exit_info_2 = fault->address;
+ }
+
+ svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ svm->vmcb->control.exit_info_1 |= fault->error_code;
+
+ /*
+ * The present bit is always zero for page structure faults on real
+ * hardware.
+ */
+ if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
+ svm->vmcb->control.exit_info_1 &= ~1;
nested_svm_vmexit(svm);
}
@@ -3031,7 +3048,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
-u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
return vmcb->control.tsc_offset +
@@ -3234,7 +3251,7 @@ static int wrmsr_interception(struct vcpu_svm *svm)
msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, &msr)) {
+ if (kvm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
} else {
@@ -3534,9 +3551,9 @@ static int handle_exit(struct kvm_vcpu *vcpu)
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- kvm_run->hw.hardware_exit_reason = exit_code;
- return 0;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
return svm_exit_handlers[exit_code](svm);
@@ -4305,6 +4322,10 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
local_irq_enable();
}
+static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -4349,7 +4370,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .fpu_activate = svm_fpu_activate,
.fpu_deactivate = svm_fpu_deactivate,
.tlb_flush = svm_flush_tlb,
@@ -4406,6 +4426,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.check_intercept = svm_check_intercept,
.handle_external_intr = svm_handle_external_intr,
+
+ .sched_in = svm_sched_in,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e850a7d332be..6b06ab8748dd 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -415,15 +415,14 @@ TRACE_EVENT(kvm_apic_ipi,
);
TRACE_EVENT(kvm_apic_accept_irq,
- TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec, bool coalesced),
- TP_ARGS(apicid, dm, tm, vec, coalesced),
+ TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec),
+ TP_ARGS(apicid, dm, tm, vec),
TP_STRUCT__entry(
__field( __u32, apicid )
__field( __u16, dm )
__field( __u8, tm )
__field( __u8, vec )
- __field( bool, coalesced )
),
TP_fast_assign(
@@ -431,14 +430,12 @@ TRACE_EVENT(kvm_apic_accept_irq,
__entry->dm = dm;
__entry->tm = tm;
__entry->vec = vec;
- __entry->coalesced = coalesced;
),
- TP_printk("apicid %x vec %u (%s|%s)%s",
+ TP_printk("apicid %x vec %u (%s|%s)",
__entry->apicid, __entry->vec,
__print_symbolic((__entry->dm >> 8 & 0x7), kvm_deliver_mode),
- __entry->tm ? "level" : "edge",
- __entry->coalesced ? " (coalesced)" : "")
+ __entry->tm ? "level" : "edge")
);
TRACE_EVENT(kvm_eoi,
@@ -850,6 +847,36 @@ TRACE_EVENT(kvm_track_tsc,
#endif /* CONFIG_X86_64 */
+TRACE_EVENT(kvm_ple_window,
+ TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+ TP_ARGS(grow, vcpu_id, new, old),
+
+ TP_STRUCT__entry(
+ __field( bool, grow )
+ __field( unsigned int, vcpu_id )
+ __field( int, new )
+ __field( int, old )
+ ),
+
+ TP_fast_assign(
+ __entry->grow = grow;
+ __entry->vcpu_id = vcpu_id;
+ __entry->new = new;
+ __entry->old = old;
+ ),
+
+ TP_printk("vcpu %u: ple_window %d (%s %d)",
+ __entry->vcpu_id,
+ __entry->new,
+ __entry->grow ? "grow" : "shrink",
+ __entry->old)
+);
+
+#define trace_kvm_ple_window_grow(vcpu_id, new, old) \
+ trace_kvm_ple_window(true, vcpu_id, new, old)
+#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
+ trace_kvm_ple_window(false, vcpu_id, new, old)
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e618f34bde2d..3e556c68351b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -125,14 +125,32 @@ module_param(nested, bool, S_IRUGO);
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
-#define KVM_VMX_DEFAULT_PLE_GAP 128
-#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_VMX_DEFAULT_PLE_GAP 128
+#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
+#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2
+#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
+#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
+ INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
+
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
+/* Default doubles per-vcpu window every exit. */
+static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, int, S_IRUGO);
+
+/* Default is to compute the maximum so we can never overflow. */
+static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, int, S_IRUGO);
+
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
@@ -379,6 +397,7 @@ struct nested_vmx {
* we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
+ struct page *virtual_apic_page;
u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
@@ -453,6 +472,7 @@ struct vcpu_vmx {
int gs_ldt_reload_needed;
int fs_reload_needed;
u64 msr_host_bndcfgs;
+ unsigned long vmcs_host_cr4; /* May not match real cr4 */
} host_state;
struct {
int vm86_active;
@@ -484,6 +504,10 @@ struct vcpu_vmx {
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
+
+ /* Dynamic PLE window. */
+ int ple_window;
+ bool ple_window_dirty;
};
enum segment_cache_field {
@@ -533,6 +557,7 @@ static int max_shadow_read_only_fields =
ARRAY_SIZE(shadow_read_only_fields);
static unsigned long shadow_read_write_fields[] = {
+ TPR_THRESHOLD,
GUEST_RIP,
GUEST_RSP,
GUEST_CR0,
@@ -743,6 +768,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
+static int alloc_identity_pagetable(struct kvm *kvm);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -1601,7 +1627,7 @@ static void reload_tss(void)
/*
* VT restores TR but not its size. Useless.
*/
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *descs;
descs = (void *)gdt->address;
@@ -1647,7 +1673,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
static unsigned long segment_base(u16 selector)
{
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *d;
unsigned long table_base;
unsigned long v;
@@ -1777,7 +1803,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
*/
if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
stts();
- load_gdt(&__get_cpu_var(host_gdt));
+ load_gdt(this_cpu_ptr(&host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -1807,7 +1833,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (vmx->loaded_vmcs->cpu != cpu) {
- struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -2135,7 +2161,7 @@ static u64 guest_read_tsc(void)
* Like guest_read_tsc, but always returns L1's notion of the timestamp
* counter, even if a nested guest (L2) is currently running.
*/
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
+static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
u64 tsc_offset;
@@ -2330,7 +2356,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
- CPU_BASED_PAUSE_EXITING |
+ CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
@@ -2601,6 +2627,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
vmcs_write64(GUEST_IA32_PAT, data);
vcpu->arch.pat = data;
break;
@@ -2631,12 +2659,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
+ u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
- kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
+ ret = kvm_set_shared_msr(msr->index, msr->data,
+ msr->mask);
preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
}
break;
}
@@ -2704,7 +2735,7 @@ static void kvm_cpu_vmxon(u64 addr)
: "memory", "cc");
}
-static int hardware_enable(void *garbage)
+static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2744,7 +2775,7 @@ static int hardware_enable(void *garbage)
ept_sync_global();
}
- native_store_gdt(&__get_cpu_var(host_gdt));
+ native_store_gdt(this_cpu_ptr(&host_gdt));
return 0;
}
@@ -2768,7 +2799,7 @@ static void kvm_cpu_vmxoff(void)
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
}
-static void hardware_disable(void *garbage)
+static void hardware_disable(void)
{
if (vmm_exclusive) {
vmclear_local_loaded_vmcss();
@@ -3107,9 +3138,17 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
- if (!cpu_has_vmx_flexpriority())
+ if (!cpu_has_vmx_flexpriority()) {
flexpriority_enabled = 0;
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if the
+ * processor does not have the APIC_ACCESS_ADDR VMCS field.
+ */
+ kvm_x86_ops->set_apic_access_page_addr = NULL;
+ }
+
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
@@ -3905,7 +3944,7 @@ static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn;
u16 data = 0;
- int r, idx, ret = 0;
+ int idx, r;
idx = srcu_read_lock(&kvm->srcu);
fn = kvm->arch.tss_addr >> PAGE_SHIFT;
@@ -3927,32 +3966,32 @@ static int init_rmode_tss(struct kvm *kvm)
r = kvm_write_guest_page(kvm, fn, &data,
RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
sizeof(u8));
- if (r < 0)
- goto out;
-
- ret = 1;
out:
srcu_read_unlock(&kvm->srcu, idx);
- return ret;
+ return r;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
- int i, idx, r, ret;
+ int i, idx, r = 0;
pfn_t identity_map_pfn;
u32 tmp;
if (!enable_ept)
- return 1;
- if (unlikely(!kvm->arch.ept_identity_pagetable)) {
- printk(KERN_ERR "EPT: identity-mapping pagetable "
- "haven't been allocated!\n");
return 0;
- }
+
+ /* Protect kvm->arch.ept_identity_pagetable_done. */
+ mutex_lock(&kvm->slots_lock);
+
if (likely(kvm->arch.ept_identity_pagetable_done))
- return 1;
- ret = 0;
+ goto out2;
+
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
+
+ r = alloc_identity_pagetable(kvm);
+ if (r < 0)
+ goto out2;
+
idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
@@ -3967,10 +4006,13 @@ static int init_rmode_identity_map(struct kvm *kvm)
goto out;
}
kvm->arch.ept_identity_pagetable_done = true;
- ret = 1;
+
out:
srcu_read_unlock(&kvm->srcu, idx);
- return ret;
+
+out2:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
}
static void seg_setup(int seg)
@@ -3995,23 +4037,28 @@ static int alloc_apic_access_page(struct kvm *kvm)
int r = 0;
mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page)
+ if (kvm->arch.apic_access_page_done)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
- kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
+ kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
if (r)
goto out;
- page = gfn_to_page(kvm, 0xfee00);
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (is_error_page(page)) {
r = -EFAULT;
goto out;
}
- kvm->arch.apic_access_page = page;
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_page_done = true;
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -4019,31 +4066,20 @@ out:
static int alloc_identity_pagetable(struct kvm *kvm)
{
- struct page *page;
+ /* Called with kvm->slots_lock held. */
+
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.ept_identity_pagetable)
- goto out;
+ BUG_ON(kvm->arch.ept_identity_pagetable_done);
+
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
- if (r)
- goto out;
-
- page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
- if (is_error_page(page)) {
- r = -EFAULT;
- goto out;
- }
- kvm->arch.ept_identity_pagetable = page;
-out:
- mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -4235,11 +4271,16 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
+ unsigned long cr4;
vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
- vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
+ /* Save the most likely value for this task's CR4 in the VMCS. */
+ cr4 = read_cr4();
+ vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
+ vmx->host_state.vmcs_host_cr4 = cr4;
+
vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
#ifdef CONFIG_X86_64
/*
@@ -4402,7 +4443,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (ple_gap) {
vmcs_write32(PLE_GAP, ple_gap);
- vmcs_write32(PLE_WINDOW, ple_window);
+ vmx->ple_window = ple_window;
+ vmx->ple_window_dirty = true;
}
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
@@ -4477,7 +4519,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
- apic_base_msr.data = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&vmx->vcpu))
apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
apic_base_msr.host_initiated = true;
@@ -4537,9 +4579,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(TPR_THRESHOLD, 0);
}
- if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
if (vmx_vm_has_apicv(vcpu->kvm))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
@@ -4729,10 +4769,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
if (ret)
return ret;
kvm->arch.tss_addr = addr;
- if (!init_rmode_tss(kvm))
- return -ENOMEM;
-
- return 0;
+ return init_rmode_tss(kvm);
}
static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
@@ -5257,7 +5294,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
msr.data = data;
msr.index = ecx;
msr.host_initiated = false;
- if (vmx_set_msr(vcpu, &msr) != 0) {
+ if (kvm_set_msr(vcpu, &msr) != 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -5521,17 +5558,18 @@ static u64 ept_rsvd_mask(u64 spte, int level)
for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
mask |= (1ULL << i);
- if (level > 2)
+ if (level == 4)
/* bits 7:3 reserved */
mask |= 0xf8;
- else if (level == 2) {
- if (spte & (1ULL << 7))
- /* 2MB ref, bits 20:12 reserved */
- mask |= 0x1ff000;
- else
- /* bits 6:3 reserved */
- mask |= 0x78;
- }
+ else if (spte & (1ULL << 7))
+ /*
+ * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
+ * level == 1 if the hypervisor is using the ignored bit 7.
+ */
+ mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
+ else if (level > 1)
+ /* bits 6:3 reserved */
+ mask |= 0x78;
return mask;
}
@@ -5561,7 +5599,8 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
WARN_ON(1);
}
- if (level == 1 || (level == 2 && (spte & (1ULL << 7)))) {
+ /* bits 5:3 are _not_ reserved for large page or leaf page */
+ if ((rsvd_bits & 0x38) == 0) {
u64 ept_mem_type = (spte & 0x38) >> 3;
if (ept_mem_type == 2 || ept_mem_type == 3 ||
@@ -5676,12 +5715,85 @@ out:
return ret;
}
+static int __grow_ple_window(int val)
+{
+ if (ple_window_grow < 1)
+ return ple_window;
+
+ val = min(val, ple_window_actual_max);
+
+ if (ple_window_grow < ple_window)
+ val *= ple_window_grow;
+ else
+ val += ple_window_grow;
+
+ return val;
+}
+
+static int __shrink_ple_window(int val, int modifier, int minimum)
+{
+ if (modifier < 1)
+ return ple_window;
+
+ if (modifier < ple_window)
+ val /= modifier;
+ else
+ val -= modifier;
+
+ return max(val, minimum);
+}
+
+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __grow_ple_window(old);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int old = vmx->ple_window;
+
+ vmx->ple_window = __shrink_ple_window(old,
+ ple_window_shrink, ple_window);
+
+ if (vmx->ple_window != old)
+ vmx->ple_window_dirty = true;
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+}
+
+/*
+ * ple_window_actual_max is computed to be one grow_ple_window() below
+ * ple_window_max. (See __grow_ple_window for the reason.)
+ * This prevents overflows, because ple_window_max is int.
+ * ple_window_max effectively rounded down to a multiple of ple_window_grow in
+ * this process.
+ * ple_window_max is also prevented from setting vmx->ple_window < ple_window.
+ */
+static void update_ple_window_actual_max(void)
+{
+ ple_window_actual_max =
+ __shrink_ple_window(max(ple_window_max, ple_window),
+ ple_window_grow, INT_MIN);
+}
+
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
*/
static int handle_pause(struct kvm_vcpu *vcpu)
{
+ if (ple_gap)
+ grow_ple_window(vcpu);
+
skip_emulated_instruction(vcpu);
kvm_vcpu_on_spin(vcpu);
@@ -6146,7 +6258,11 @@ static void free_nested(struct vcpu_vmx *vmx)
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = 0;
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ nested_release_page(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
}
nested_free_all_saved_vmcss(vmx);
@@ -6310,6 +6426,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
const unsigned long *fields = shadow_read_write_fields;
const int num_fields = max_shadow_read_write_fields;
+ preempt_disable();
+
vmcs_load(shadow_vmcs);
for (i = 0; i < num_fields; i++) {
@@ -6333,6 +6451,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
vmcs_clear(shadow_vmcs);
vmcs_load(vmx->loaded_vmcs->vmcs);
+
+ preempt_enable();
}
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
@@ -6617,7 +6737,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
switch (type) {
case VMX_EPT_EXTENT_GLOBAL:
kvm_mmu_sync_roots(vcpu);
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
nested_vmx_succeed(vcpu);
break;
default:
@@ -6630,6 +6750,12 @@ static int handle_invept(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invvpid(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -6675,6 +6801,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait,
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
+ [EXIT_REASON_INVVPID] = handle_invvpid,
};
static const int kvm_vmx_max_exit_handlers =
@@ -6892,6 +7019,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_TASK_SWITCH:
return 1;
case EXIT_REASON_CPUID:
+ if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
+ return 0;
return 1;
case EXIT_REASON_HLT:
return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -6908,7 +7037,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
- case EXIT_REASON_INVEPT:
+ case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
/*
* VMX instructions trap unconditionally. This allows L1 to
* emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6936,7 +7065,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_MCE_DURING_VMENTRY:
return 0;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
- return 1;
+ return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
@@ -7049,14 +7178,20 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
else {
- vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
- vcpu->run->hw.hardware_exit_reason = exit_reason;
+ WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
}
- return 0;
}
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return;
+
if (irr == -1 || tpr < irr) {
vmcs_write32(TPR_THRESHOLD, 0);
return;
@@ -7094,6 +7229,29 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
vmx_set_msr_bitmap(vcpu);
}
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * Currently we do not handle the nested case where L2 has an
+ * APIC access page of its own; that page is still pinned.
+ * Hence, we skip the case where the VCPU is in guest mode _and_
+ * L1 prepared an APIC access page for L2.
+ *
+ * For the case where L1 and L2 share the same APIC access page
+ * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
+ * in the vmcs12), this function will only update either the vmcs01
+ * or the vmcs02. If the former, the vmcs02 will be updated by
+ * prepare_vmcs02. If the latter, the vmcs01 will be updated in
+ * the next L2->L1 exit.
+ */
+ if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(vmx->nested.current_vmcs12,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+}
+
static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
{
u16 status;
@@ -7376,7 +7534,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long debugctlmsr;
+ unsigned long debugctlmsr, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -7387,6 +7545,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vmx->emulation_required)
return;
+ if (vmx->ple_window_dirty) {
+ vmx->ple_window_dirty = false;
+ vmcs_write32(PLE_WINDOW, vmx->ple_window);
+ }
+
if (vmx->nested.sync_shadow_vmcs) {
copy_vmcs12_to_shadow(vmx);
vmx->nested.sync_shadow_vmcs = false;
@@ -7397,6 +7560,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
+ cr4 = read_cr4();
+ if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
+ vmcs_writel(HOST_CR4, cr4);
+ vmx->host_state.vmcs_host_cr4 = cr4;
+ }
+
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -7642,10 +7811,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
- err = -ENOMEM;
- if (alloc_identity_pagetable(kvm) != 0)
- goto free_vmcs;
- if (!init_rmode_identity_map(kvm))
+ err = init_rmode_identity_map(kvm);
+ if (err)
goto free_vmcs;
}
@@ -7824,6 +7991,55 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
kvm_inject_page_fault(vcpu, fault);
}
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+ /* TODO: Also verify bits beyond physical address width are 0 */
+ if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+ return false;
+
+ /*
+ * Translate L1 physical address to host physical
+ * address for vmcs02. Keep the page pinned, so this
+ * physical address remains valid. We keep a reference
+ * to it so we can release it later.
+ */
+ if (vmx->nested.apic_access_page) /* shouldn't happen */
+ nested_release_page(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page =
+ nested_get_page(vcpu, vmcs12->apic_access_addr);
+ }
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ /* TODO: Also verify bits beyond physical address width are 0 */
+ if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+ return false;
+
+ if (vmx->nested.virtual_apic_page) /* shouldn't happen */
+ nested_release_page(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page =
+ nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
+
+ /*
+ * Failing the vm entry is _not_ what the processor does
+ * but it's basically the only possibility we have.
+ * We could still enter the guest if CR8 load exits are
+ * enabled, CR8 store exits are enabled, and virtualize APIC
+ * access is disabled; in this case the processor would never
+ * use the TPR shadow and we could simply clear the bit from
+ * the execution control. But such a configuration is useless,
+ * so let's keep the code simple.
+ */
+ if (!vmx->nested.virtual_apic_page)
+ return false;
+ }
+
+ return true;
+}
+
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
{
u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
@@ -7849,7 +8065,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
- * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
+ * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
* guest in a way that will both be appropriate to L1's requests, and our
* needs. In addition to modifying the active vmcs (which is vmcs02), this
* function also has additional necessary side-effects, like setting various
@@ -7970,16 +8186,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
/*
- * Translate L1 physical address to host physical
- * address for vmcs02. Keep the page pinned, so this
- * physical address remains valid. We keep a reference
- * to it so we can release it later.
- */
- if (vmx->nested.apic_access_page) /* shouldn't happen */
- nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page =
- nested_get_page(vcpu, vmcs12->apic_access_addr);
- /*
* If translation failed, no matter: This feature asks
* to exit when accessing the given address, and if it
* can never be accessed, this feature won't do
@@ -7994,8 +8200,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
} else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vcpu->kvm->arch.apic_access_page));
+ kvm_vcpu_reload_apic_access_page(vcpu);
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
@@ -8024,6 +8229,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+
+ if (exec_control & CPU_BASED_TPR_SHADOW) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ page_to_phys(vmx->nested.virtual_apic_page));
+ vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
+ }
+
/*
* Merging of IO and MSR bitmaps not currently supported.
* Rather, exit every time.
@@ -8185,8 +8397,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
- !PAGE_ALIGNED(vmcs12->apic_access_addr)) {
+ if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8754,6 +8965,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ vmx_load_vmcs01(vcpu);
+
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
&& nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
@@ -8769,8 +8982,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- vmx_load_vmcs01(vcpu);
-
vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
vmx_segment_cache_clear(vmx);
@@ -8790,10 +9001,20 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = 0;
+ vmx->nested.apic_access_page = NULL;
+ }
+ if (vmx->nested.virtual_apic_page) {
+ nested_release_page(vmx->nested.virtual_apic_page);
+ vmx->nested.virtual_apic_page = NULL;
}
/*
+ * We are now running in L2, mmu_notifier will force to reload the
+ * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
+ */
+ kvm_vcpu_reload_apic_access_page(vcpu);
+
+ /*
* Exiting from L2 to L1, we're now back to L1 which thinks it just
* finished a VMLAUNCH or VMRESUME instruction, so we need to set the
* success or failure flag accordingly.
@@ -8846,6 +9067,12 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
}
+static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ if (ple_gap)
+ shrink_ple_window(vcpu);
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -8890,7 +9117,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .fpu_activate = vmx_fpu_activate,
.fpu_deactivate = vmx_fpu_deactivate,
.tlb_flush = vmx_flush_tlb,
@@ -8913,6 +9139,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.vm_has_apicv = vmx_vm_has_apicv,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
@@ -8951,6 +9178,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.mpx_supported = vmx_mpx_supported,
.check_nested_events = vmx_check_nested_events,
+
+ .sched_in = vmx_sched_in,
};
static int __init vmx_init(void)
@@ -9065,6 +9294,8 @@ static int __init vmx_init(void)
} else
kvm_disable_tdp();
+ update_ple_window_actual_max();
+
return 0;
out7:
@@ -9098,7 +9329,7 @@ static void __exit vmx_exit(void)
free_page((unsigned long)vmx_vmread_bitmap);
#ifdef CONFIG_KEXEC
- rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+ RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ca3d760dd581..0033df32a745 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -229,24 +229,29 @@ static void kvm_shared_msr_cpu_online(void)
shared_msr_update(i, shared_msrs_global.msrs[i]);
}
-void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int err;
if (((value ^ smsr->values[slot].curr) & mask) == 0)
- return;
+ return 0;
smsr->values[slot].curr = value;
- wrmsrl(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ if (err)
+ return 1;
+
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
smsr->registered = true;
}
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
-static void drop_user_return_notifiers(void *ignore)
+static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
@@ -408,12 +413,14 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+
+ return fault->nested_page_fault;
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -457,11 +464,12 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t ngfn, void *data, int offset, int len,
u32 access)
{
+ struct x86_exception exception;
gfn_t real_gfn;
gpa_t ngpa;
ngpa = gfn_to_gpa(ngfn);
- real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
+ real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
if (real_gfn == UNMAPPED_GVA)
return -EFAULT;
@@ -726,7 +734,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
- kvm_mmu_flush_tlb(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
return 0;
}
@@ -984,7 +992,6 @@ void kvm_enable_efer_bits(u64 mask)
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
-
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
@@ -992,8 +999,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
*/
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
+ switch (msr->index) {
+ case MSR_FS_BASE:
+ case MSR_GS_BASE:
+ case MSR_KERNEL_GS_BASE:
+ case MSR_CSTAR:
+ case MSR_LSTAR:
+ if (is_noncanonical_address(msr->data))
+ return 1;
+ break;
+ case MSR_IA32_SYSENTER_EIP:
+ case MSR_IA32_SYSENTER_ESP:
+ /*
+ * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
+ * a non-canonical address is written on Intel but not on
+ * AMD (which ignores the top 32-bits, because it does
+ * not implement 64-bit SYSENTER).
+ *
+ * 64-bit code should hence be able to write a non-canonical
+ * value on AMD. Making the address canonical ensures that
+ * vmentry does not fail on Intel after writing a non-canonical
+ * value, and that something deterministic happens if the guest
+ * invokes 64-bit SYSENTER.
+ */
+ msr->data = get_canonical(msr->data);
+ }
return kvm_x86_ops->set_msr(vcpu, msr);
}
+EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
@@ -1518,7 +1551,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
pvclock_update_vm_gtod_copy(kvm);
kvm_for_each_vcpu(i, vcpu, kvm)
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
/* guest entries allowed */
kvm_for_each_vcpu(i, vcpu, kvm)
@@ -1556,7 +1589,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1661,7 +1694,7 @@ static void kvmclock_update_fn(struct work_struct *work)
struct kvm_vcpu *vcpu;
kvm_for_each_vcpu(i, vcpu, kvm) {
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
kvm_vcpu_kick(vcpu);
}
}
@@ -1670,7 +1703,7 @@ static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
{
struct kvm *kvm = v->kvm;
- set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
schedule_delayed_work(&kvm->arch.kvmclock_update_work,
KVMCLOCK_UPDATE_DELAY);
}
@@ -1723,9 +1756,10 @@ static bool valid_mtrr_type(unsigned t)
return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
}
-static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
int i;
+ u64 mask;
if (!msr_mtrr_valid(msr))
return false;
@@ -1747,14 +1781,31 @@ static bool mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
/* variable MTRRs */
- return valid_mtrr_type(data & 0xff);
+ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+ mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+ if (data & mask) {
+ kvm_inject_gp(vcpu, 0);
+ return false;
+ }
+
+ return true;
}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
- if (!mtrr_valid(vcpu, msr, data))
+ if (!kvm_mtrr_valid(vcpu, msr, data))
return 1;
if (msr == MSR_MTRRdefType) {
@@ -1805,7 +1856,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
u32 offset = msr - MSR_IA32_MC0_CTL;
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
@@ -2164,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr, data);
/* Performance counters are not protected by a CPUID bit,
@@ -2330,7 +2381,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
break;
default:
if (msr >= MSR_IA32_MC0_CTL &&
- msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
+ msr < MSR_IA32_MCx_CTL(bank_num)) {
u32 offset = msr - MSR_IA32_MC0_CTL;
data = vcpu->arch.mce_banks[offset];
break;
@@ -2419,7 +2470,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_K7_EVNTSEL0:
+ case MSR_K7_EVNTSEL1:
+ case MSR_K7_EVNTSEL2:
+ case MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
@@ -2505,7 +2562,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_CAP:
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
+ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr, pdata);
case MSR_K7_CLK_CTL:
/*
@@ -2636,7 +2693,7 @@ out:
return r;
}
-int kvm_dev_ioctl_check_extension(long ext)
+int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -2823,7 +2880,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
vcpu->arch.tsc_offset_adjustment = 0;
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
@@ -4040,16 +4097,16 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_segment(vcpu, var, seg);
}
-gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+ struct x86_exception *exception)
{
gpa_t t_gpa;
- struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
return t_gpa;
}
@@ -4906,16 +4963,18 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
}
}
-static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
- kvm_propagate_fault(vcpu, &ctxt->exception);
- else if (ctxt->exception.error_code_valid)
+ return kvm_propagate_fault(vcpu, &ctxt->exception);
+
+ if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception.vector);
+ return false;
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4972,7 +5031,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- if (!is_guest_mode(vcpu)) {
+ if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -5224,6 +5283,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
ctxt->interruptibility = 0;
ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
ctxt->perm_ok = false;
ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
@@ -5276,8 +5336,9 @@ restart:
}
if (ctxt->have_exception) {
- inject_emulated_exception(vcpu);
r = EMULATE_DONE;
+ if (inject_emulated_exception(vcpu))
+ return r;
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
@@ -5545,7 +5606,7 @@ static void kvm_set_mmio_spte_mask(void)
* entry to generate page fault with PFER.RSV = 1.
*/
/* Mask the reserved physical address bits. */
- mask = ((1ull << (51 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ mask = rsvd_bits(maxphyaddr, 51);
/* Bit 62 is always reserved for 32bit host. */
mask |= 0x3ull << 62;
@@ -5576,7 +5637,7 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm)
- set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
atomic_set(&kvm_guest_has_master_clock, 0);
spin_unlock(&kvm_lock);
}
@@ -5989,6 +6050,44 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
kvm_apic_update_tmr(vcpu, tmr);
}
+static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops->tlb_flush(vcpu);
+}
+
+void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct page *page = NULL;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ if (!kvm_x86_ops->set_apic_access_page_addr)
+ return;
+
+ page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
+
+ /*
+ * Do not pin apic access page in memory, the MMU notifier
+ * will call us again if it is migrated or swapped out.
+ */
+ put_page(page);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
+
+void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+ /*
+ * The physical address of the apic access page is stored in the VMCS.
+ * Update it when it becomes invalid.
+ */
+ if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
+}
+
/*
* Returns 1 to let __vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -6018,7 +6117,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_x86_ops->tlb_flush(vcpu);
+ kvm_vcpu_flush_tlb(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
@@ -6049,6 +6148,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
+ if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
+ kvm_vcpu_reload_apic_access_page(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6934,7 +7035,7 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
kvm_rip_write(vcpu, 0);
}
-int kvm_arch_hardware_enable(void *garbage)
+int kvm_arch_hardware_enable(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@@ -6945,7 +7046,7 @@ int kvm_arch_hardware_enable(void *garbage)
bool stable, backwards_tsc = false;
kvm_shared_msr_cpu_online();
- ret = kvm_x86_ops->hardware_enable(garbage);
+ ret = kvm_x86_ops->hardware_enable();
if (ret != 0)
return ret;
@@ -6954,7 +7055,7 @@ int kvm_arch_hardware_enable(void *garbage)
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!stable && vcpu->cpu == smp_processor_id())
- set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (stable && vcpu->arch.last_host_tsc > local_tsc) {
backwards_tsc = true;
if (vcpu->arch.last_host_tsc > max_tsc)
@@ -7008,8 +7109,7 @@ int kvm_arch_hardware_enable(void *garbage)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
vcpu->arch.last_host_tsc = local_tsc;
- set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
- &vcpu->requests);
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
}
/*
@@ -7026,10 +7126,10 @@ int kvm_arch_hardware_enable(void *garbage)
return 0;
}
-void kvm_arch_hardware_disable(void *garbage)
+void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops->hardware_disable(garbage);
- drop_user_return_notifiers(garbage);
+ kvm_x86_ops->hardware_disable();
+ drop_user_return_notifiers();
}
int kvm_arch_hardware_setup(void)
@@ -7146,6 +7246,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
static_key_slow_dec(&kvm_no_apic_vcpu);
}
+void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
+{
+ kvm_x86_ops->sched_in(vcpu, cpu);
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
if (type)
@@ -7237,10 +7342,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- if (kvm->arch.apic_access_page)
- put_page(kvm->arch.apic_access_page);
- if (kvm->arch.ept_identity_pagetable)
- put_page(kvm->arch.ept_identity_pagetable);
kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
}
@@ -7643,3 +7744,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 306a1b77581f..7cb9c45a5fe0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -88,15 +88,23 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
+ vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+}
+
+static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.mmio_gen == kvm_memslots(vcpu->kvm)->generation;
}
/*
- * Clear the mmio cache info for the given gva,
- * specially, if gva is ~0ul, we clear all mmio cache info.
+ * Clear the mmio cache info for the given gva. If gva is MMIO_GVA_ANY, we
+ * clear all mmio cache info.
*/
+#define MMIO_GVA_ANY (~(gva_t)0)
+
static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
{
- if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ if (gva != MMIO_GVA_ANY && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
return;
vcpu->arch.mmio_gva = 0;
@@ -104,7 +112,8 @@ static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
{
- if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gva &&
+ vcpu->arch.mmio_gva == (gva & PAGE_MASK))
return true;
return false;
@@ -112,7 +121,8 @@ static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
- if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ if (vcpu_match_mmio_gen(vcpu) && vcpu->arch.mmio_gfn &&
+ vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
return true;
return false;
@@ -149,6 +159,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+
#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
| XSTATE_BNDREGS | XSTATE_BNDCSR)
extern u64 host_xcr0;
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 4d4f96a27638..db92793b7e23 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -20,7 +20,6 @@ lib-y := delay.o misc.o cmdline.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
-lib-$(CONFIG_SMP) += rwlock.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
@@ -39,7 +38,7 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += thunk_64.o clear_page_64.o copy_page_64.o
+ lib-y += clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 1e572c507d06..40a172541ee2 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -6,15 +6,8 @@
*
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
#include <asm/dwarf2.h>
-
-#ifdef CONFIG_SMP
-#define SEG_PREFIX %gs:
-#else
-#define SEG_PREFIX
-#endif
+#include <asm/percpu.h>
.text
@@ -39,24 +32,25 @@ CFI_STARTPROC
# *atomic* on a single cpu (as provided by the this_cpu_xx class of
# macros).
#
-this_cpu_cmpxchg16b_emu:
- pushf
+ pushfq_cfi
cli
- cmpq SEG_PREFIX(%rsi), %rax
- jne not_same
- cmpq SEG_PREFIX 8(%rsi), %rdx
- jne not_same
+ cmpq PER_CPU_VAR((%rsi)), %rax
+ jne .Lnot_same
+ cmpq PER_CPU_VAR(8(%rsi)), %rdx
+ jne .Lnot_same
- movq %rbx, SEG_PREFIX(%rsi)
- movq %rcx, SEG_PREFIX 8(%rsi)
+ movq %rbx, PER_CPU_VAR((%rsi))
+ movq %rcx, PER_CPU_VAR(8(%rsi))
- popf
+ CFI_REMEMBER_STATE
+ popfq_cfi
mov $1, %al
ret
- not_same:
- popf
+ CFI_RESTORE_STATE
+.Lnot_same:
+ popfq_cfi
xor %al,%al
ret
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index 828cb710dec2..b4807fce5177 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,11 +7,8 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
#include <asm/dwarf2.h>
-
.text
/*
@@ -30,27 +27,28 @@ CFI_STARTPROC
# set the whole ZF thing (caller will just compare
# eax:edx with the expected value)
#
-cmpxchg8b_emu:
- pushfl
+ pushfl_cfi
cli
cmpl (%esi), %eax
- jne not_same
+ jne .Lnot_same
cmpl 4(%esi), %edx
- jne half_same
+ jne .Lhalf_same
movl %ebx, (%esi)
movl %ecx, 4(%esi)
- popfl
+ CFI_REMEMBER_STATE
+ popfl_cfi
ret
- not_same:
+ CFI_RESTORE_STATE
+.Lnot_same:
movl (%esi), %eax
- half_same:
+.Lhalf_same:
movl 4(%esi), %edx
- popfl
+ popfl_cfi
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 7609e0e421ec..1318f75d56e4 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -41,9 +41,8 @@ csum_partial_copy_from_user(const void __user *src, void *dst,
while (((unsigned long)src & 6) && len >= 2) {
__u16 val16;
- *errp = __get_user(val16, (const __u16 __user *)src);
- if (*errp)
- return isum;
+ if (__get_user(val16, (const __u16 __user *)src))
+ goto out_err;
*(__u16 *)dst = val16;
isum = (__force __wsum)add32_with_carry(
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
deleted file mode 100644
index 1cad22139c88..000000000000
--- a/arch/x86/lib/rwlock.S
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/rwlock.h>
-
-#ifdef CONFIG_X86_32
-# define __lock_ptr eax
-#else
-# define __lock_ptr rdi
-#endif
-
-ENTRY(__write_lock_failed)
- CFI_STARTPROC
- FRAME
-0: LOCK_PREFIX
- WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
-1: rep; nop
- cmpl $WRITE_LOCK_CMP, (%__lock_ptr)
- jne 1b
- LOCK_PREFIX
- WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
- jnz 0b
- ENDFRAME
- ret
- CFI_ENDPROC
-END(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
- CFI_STARTPROC
- FRAME
-0: LOCK_PREFIX
- READ_LOCK_SIZE(inc) (%__lock_ptr)
-1: rep; nop
- READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
- js 1b
- LOCK_PREFIX
- READ_LOCK_SIZE(dec) (%__lock_ptr)
- js 0b
- ENDFRAME
- ret
- CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 28f85c916712..e28cdaf5ac2c 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -6,25 +6,46 @@
*/
#include <linux/linkage.h>
#include <asm/asm.h>
+ #include <asm/dwarf2.h>
-#ifdef CONFIG_TRACE_IRQFLAGS
/* put return address in eax (arg1) */
- .macro thunk_ra name,func
+ .macro THUNK name, func, put_ret_addr_in_eax=0
.globl \name
\name:
- pushl %eax
- pushl %ecx
- pushl %edx
+ CFI_STARTPROC
+ pushl_cfi %eax
+ CFI_REL_OFFSET eax, 0
+ pushl_cfi %ecx
+ CFI_REL_OFFSET ecx, 0
+ pushl_cfi %edx
+ CFI_REL_OFFSET edx, 0
+
+ .if \put_ret_addr_in_eax
/* Place EIP in the arg1 */
movl 3*4(%esp), %eax
+ .endif
+
call \func
- popl %edx
- popl %ecx
- popl %eax
+ popl_cfi %edx
+ CFI_RESTORE edx
+ popl_cfi %ecx
+ CFI_RESTORE ecx
+ popl_cfi %eax
+ CFI_RESTORE eax
ret
+ CFI_ENDPROC
_ASM_NOKPROBE(\name)
.endm
- thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
- thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+ THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+ THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
+#endif
+
+#ifdef CONFIG_PREEMPT
+ THUNK ___preempt_schedule, preempt_schedule
+#ifdef CONFIG_CONTEXT_TRACKING
+ THUNK ___preempt_schedule_context, preempt_schedule_context
#endif
+#endif
+
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 92d9feaff42b..b30b5ebd614a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -38,6 +38,13 @@
THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
#endif
+#ifdef CONFIG_PREEMPT
+ THUNK ___preempt_schedule, preempt_schedule
+#ifdef CONFIG_CONTEXT_TRACKING
+ THUNK ___preempt_schedule_context, preempt_schedule_context
+#endif
+#endif
+
/* SAVE_ARGS below is used only for the .cfi directives it contains. */
CFI_STARTPROC
SAVE_ARGS
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 167ffcac16ed..95a427e57887 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -48,7 +48,9 @@ enum address_markers_idx {
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
+# ifdef CONFIG_X86_ESPFIX64
ESPFIX_START_NR,
+# endif
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
@@ -71,7 +73,9 @@ static struct addr_marker address_markers[] = {
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
+# ifdef CONFIG_X86_ESPFIX64
{ ESPFIX_BASE_ADDR, "ESPfix Area", 16 },
+# endif
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1dbade870f90..d973e61e450d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,7 +3,6 @@
* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
*/
-#include <linux/magic.h> /* STACK_END_MAGIC */
#include <linux/sched.h> /* test_thread_flag(), ... */
#include <linux/kdebug.h> /* oops_begin/end, ... */
#include <linux/module.h> /* search_exception_table */
@@ -350,7 +349,7 @@ out:
void vmalloc_sync_all(void)
{
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}
/*
@@ -649,7 +648,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
unsigned long address, int signal, int si_code)
{
struct task_struct *tsk = current;
- unsigned long *stackend;
unsigned long flags;
int sig;
@@ -709,8 +707,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
show_fault_oops(regs, error_code, address);
- stackend = end_of_stack(tsk);
- if (tsk != &init_task && *stackend != STACK_END_MAGIC)
+ if (task_stack_end_corrupted(tsk))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
tsk->thread.cr2 = address;
@@ -933,8 +930,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* cross-processor TLB flush, even if no stale TLB entries exist
* on other processors.
*
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permissions than the page table entry. Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
@@ -945,8 +951,17 @@ spurious_fault(unsigned long error_code, unsigned long address)
pte_t *pte;
int ret;
- /* Reserved-bit violation or user access to kernel space? */
- if (error_code & (PF_USER | PF_RSVD))
+ /*
+ * Only writes to RO or instruction fetches from NX may cause
+ * spurious faults.
+ *
+ * These could be from user or supervisor accesses but the TLB
+ * is only lazily flushed after a kernel mapping protection
+ * change, so user accesses are not expected to cause spurious
+ * faults.
+ */
+ if (error_code != (PF_WRITE | PF_PROT)
+ && error_code != (PF_INSTR | PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);
@@ -1218,7 +1233,8 @@ good_area:
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
- * the fault:
+ * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
+ * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
*/
fault = handle_mm_fault(mm, vma, address, flags);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e39504878aec..c8140e12816a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -537,7 +537,7 @@ static void __init pagetable_init(void)
permanent_kmaps_init(pgd_base);
}
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* user-defined highmem size */
@@ -825,7 +825,8 @@ void __init mem_init(void)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
- struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
+ struct zone *zone = pgdata->node_zones +
+ zone_for_memory(nid, start, size, ZONE_HIGHMEM);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index df1a9927ad29..4cb8763868fc 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+pteval_t __supported_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
int force_personality32;
@@ -178,7 +178,7 @@ __setup("noexec32=", nonx32_setup);
* When memory was added/removed make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
-void sync_global_pgds(unsigned long start, unsigned long end)
+void sync_global_pgds(unsigned long start, unsigned long end, int removed)
{
unsigned long address;
@@ -186,7 +186,12 @@ void sync_global_pgds(unsigned long start, unsigned long end)
const pgd_t *pgd_ref = pgd_offset_k(address);
struct page *page;
- if (pgd_none(*pgd_ref))
+ /*
+ * When it is called after memory hot remove, pgd_none()
+ * returns true. In this case (removed == 1), we must clear
+ * the PGD entries in the local PGD level page.
+ */
+ if (pgd_none(*pgd_ref) && !removed)
continue;
spin_lock(&pgd_lock);
@@ -199,12 +204,18 @@ void sync_global_pgds(unsigned long start, unsigned long end)
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- else
+ if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
BUG_ON(pgd_page_vaddr(*pgd)
!= pgd_page_vaddr(*pgd_ref));
+ if (removed) {
+ if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
+ pgd_clear(pgd);
+ } else {
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
+ }
+
spin_unlock(pgt_lock);
}
spin_unlock(&pgd_lock);
@@ -633,7 +644,7 @@ kernel_physical_mapping_init(unsigned long start,
}
if (pgd_changed)
- sync_global_pgds(addr, end - 1);
+ sync_global_pgds(addr, end - 1, 0);
__flush_tlb_all();
@@ -691,7 +702,8 @@ static void update_end_of_memory_vars(u64 start, u64 size)
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
- struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
+ struct zone *zone = pgdat->node_zones +
+ zone_for_memory(nid, start, size, ZONE_NORMAL);
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
@@ -975,25 +987,26 @@ static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
unsigned long next;
+ unsigned long addr;
pgd_t *pgd;
pud_t *pud;
bool pgd_changed = false;
- for (; start < end; start = next) {
- next = pgd_addr_end(start, end);
+ for (addr = start; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
- pgd = pgd_offset_k(start);
+ pgd = pgd_offset_k(addr);
if (!pgd_present(*pgd))
continue;
pud = (pud_t *)pgd_page_vaddr(*pgd);
- remove_pud_table(pud, start, next, direct);
+ remove_pud_table(pud, addr, next, direct);
if (free_pud_table(pud, pgd))
pgd_changed = true;
}
if (pgd_changed)
- sync_global_pgds(start, end - 1);
+ sync_global_pgds(start, end - 1, 1);
flush_tlb_all();
}
@@ -1340,7 +1353,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
else
err = vmemmap_populate_basepages(start, end, node);
if (!err)
- sync_global_pgds(start, end - 1);
+ sync_global_pgds(start, end - 1, 0);
return err;
}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index baff1da354e0..af78e50ca6ce 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -86,6 +86,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
pgprot_t prot;
int retval;
void __iomem *ret_addr;
+ int ram_region;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
@@ -108,12 +109,23 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- pfn = phys_addr >> PAGE_SHIFT;
- last_pfn = last_addr >> PAGE_SHIFT;
- if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
- __ioremap_check_ram) == 1)
+ /* First check if whole region can be identified as RAM or not */
+ ram_region = region_is_ram(phys_addr, size);
+ if (ram_region > 0) {
+ WARN_ONCE(1, "ioremap on RAM at 0x%lx - 0x%lx\n",
+ (unsigned long int)phys_addr,
+ (unsigned long int)last_addr);
return NULL;
+ }
+ /* If it could not be identified (-1), check page by page */
+ if (ram_region < 0) {
+ pfn = phys_addr >> PAGE_SHIFT;
+ last_pfn = last_addr >> PAGE_SHIFT;
+ if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
+ __ioremap_check_ram) == 1)
+ return NULL;
+ }
/*
* Mappings have to be page-aligned
*/
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index dd89a13f1051..b4f2e7e9e907 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -140,7 +140,7 @@ static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
bool kmemcheck_active(struct pt_regs *regs)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
return data->balance > 0;
}
@@ -148,7 +148,7 @@ bool kmemcheck_active(struct pt_regs *regs)
/* Save an address that needs to be shown/hidden */
static void kmemcheck_save_addr(unsigned long addr)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
data->addr[data->n_addrs++] = addr;
@@ -156,7 +156,7 @@ static void kmemcheck_save_addr(unsigned long addr)
static unsigned int kmemcheck_show_all(void)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
unsigned int i;
unsigned int n;
@@ -169,7 +169,7 @@ static unsigned int kmemcheck_show_all(void)
static unsigned int kmemcheck_hide_all(void)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
unsigned int i;
unsigned int n;
@@ -185,7 +185,7 @@ static unsigned int kmemcheck_hide_all(void)
*/
void kmemcheck_show(struct pt_regs *regs)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
BUG_ON(!irqs_disabled());
@@ -226,7 +226,7 @@ void kmemcheck_show(struct pt_regs *regs)
*/
void kmemcheck_hide(struct pt_regs *regs)
{
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
int n;
BUG_ON(!irqs_disabled());
@@ -528,7 +528,7 @@ static void kmemcheck_access(struct pt_regs *regs,
const uint8_t *insn_primary;
unsigned int size;
- struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
+ struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
/* Recursive fault -- ouch. */
if (data->busy) {
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 25e7e1372bb2..919b91205cd4 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -31,7 +31,7 @@
#include <linux/sched.h>
#include <asm/elf.h>
-struct __read_mostly va_alignment va_align = {
+struct va_alignment __read_mostly va_align = {
.flags = -1,
};
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index a32b706c401a..1a883705a12a 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
-/* Initialize NODE_DATA for a node on the local memory */
-static void __init setup_node_data(int nid, u64 start, u64 end)
+/* Allocate NODE_DATA for a node on the local memory */
+static void __init alloc_node_data(int nid)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
@@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
int tnid;
/*
- * Don't confuse VM with a node that doesn't have the
- * minimum amount of memory:
- */
- if (end && (end - start) < NODE_MIN_SIZE)
- return;
-
- start = roundup(start, ZONE_ALIGN);
-
- printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
- nid, start, end - 1);
-
- /*
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
@@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd = __va(nd_pa);
/* report and initialize */
- printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
+ printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
@@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
- NODE_DATA(nid)->node_id = nid;
- NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
- NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
node_set_online(nid);
}
@@ -478,6 +463,42 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
return true;
}
+/*
+ * Make every node that holds kernel-reserved memory non-hotpluggable.
+ *
+ * Tags memblock.reserved with the node ids from numa_meminfo, collects
+ * the set of nodes that own at least one reserved region, and clears
+ * MEMBLOCK_HOTPLUG on all numa_meminfo blocks belonging to those nodes.
+ */
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+	nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
+	struct memblock_region *r;
+	int i;
+
+	/*
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel. Setting the nid in memblock.reserved
+	 * marks out all the nodes the kernel resides in.
+	 */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = &numa_meminfo.blk[i];
+
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.reserved, mb->nid);
+	}
+
+	/*
+	 * Mark all kernel nodes. A reserved region that no numa_memblk
+	 * covers was not assigned a node above and still carries
+	 * MAX_NUMNODES; skip it, since node_set() with that value would
+	 * write past the end of the on-stack nodemask.
+	 */
+	for_each_memblock(reserved, r)
+		if (r->nid != MAX_NUMNODES)
+			node_set(r->nid, numa_kernel_nodes);
+
+	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = &numa_meminfo.blk[i];
+
+		if (!node_isset(mb->nid, numa_kernel_nodes))
+			continue;
+
+		/*
+		 * Use the block's own fields rather than unsigned long
+		 * temporaries, so the bounds are not narrowed on 32-bit.
+		 */
+		memblock_clear_hotplug(mb->start, mb->end - mb->start);
+	}
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -496,6 +517,15 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
}
/*
+ * Very early in boot the kernel has to use some memory, for example
+ * to load the kernel image. We cannot prevent this, so any node the
+ * kernel resides in should be un-hotpluggable.
+ *
+ * And when we come here, alloc node data won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ /*
* If sections array is gonna be used for pfn -> nid mapping, check
* whether its granularity is fine enough.
*/
@@ -523,8 +553,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
end = max(mi->blk[i].end, end);
}
- if (start < end)
- setup_node_data(nid, start, end);
+ if (start >= end)
+ continue;
+
+ /*
+ * Don't confuse VM with a node that doesn't have the
+ * minimum amount of memory:
+ */
+ if (end && (end - start) < NODE_MIN_SIZE)
+ continue;
+
+ alloc_node_data(nid);
}
/* Dump memblock with node info and return. */
@@ -554,41 +593,6 @@ static void __init numa_init_array(void)
}
}
-static void __init numa_clear_kernel_node_hotplug(void)
-{
- int i, nid;
- nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
- unsigned long start, end;
- struct memblock_region *r;
-
- /*
- * At this time, all memory regions reserved by memblock are
- * used by the kernel. Set the nid in memblock.reserved will
- * mark out all the nodes the kernel resides in.
- */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- struct numa_memblk *mb = &numa_meminfo.blk[i];
- memblock_set_node(mb->start, mb->end - mb->start,
- &memblock.reserved, mb->nid);
- }
-
- /* Mark all kernel nodes. */
- for_each_memblock(reserved, r)
- node_set(r->nid, numa_kernel_nodes);
-
- /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
- for (i = 0; i < numa_meminfo.nr_blks; i++) {
- nid = numa_meminfo.blk[i].nid;
- if (!node_isset(nid, numa_kernel_nodes))
- continue;
-
- start = numa_meminfo.blk[i].start;
- end = numa_meminfo.blk[i].end;
-
- memblock_clear_hotplug(start, end - start);
- }
-}
-
static int __init numa_init(int (*init_func)(void))
{
int i;
@@ -643,15 +647,6 @@ static int __init numa_init(int (*init_func)(void))
}
numa_init_array();
- /*
- * At very early time, the kernel have to use some memory such as
- * loading the kernel image. We cannot prevent this anyway. So any
- * node the kernel resides in should be un-hotpluggable.
- *
- * And when we come here, numa_init() won't fail.
- */
- numa_clear_kernel_node_hotplug();
-
return 0;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index ae242a7c11c7..36de293caf25 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -409,7 +409,7 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
psize = page_level_size(level);
pmask = page_level_mask(level);
offset = virt_addr & ~pmask;
- phys_addr = pte_pfn(*pte) << PAGE_SHIFT;
+ phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
return (phys_addr | offset);
}
EXPORT_SYMBOL_GPL(slow_virt_to_phys);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 4dd8cf652579..75cc0978d45d 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -59,41 +59,6 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
__flush_tlb_one(vaddr);
}
-/*
- * Associate a large virtual page frame with a given physical page frame
- * and protection flags for that frame. pfn is for the base of the page,
- * vaddr is what the page gets mapped to - both must be properly aligned.
- * The pmd must already be instantiated. Assumes PAE mode.
- */
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
- printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
- return; /* BUG(); */
- }
- if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
- printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
- return; /* BUG(); */
- }
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
- return; /* BUG(); */
- }
- pud = pud_offset(pgd, vaddr);
- pmd = pmd_offset(pud, vaddr);
- set_pmd(pmd, pfn_pmd(pfn, flags));
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
-}
-
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1fe33987de02..ee61c36d64f8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,7 +49,13 @@ void leave_mm(int cpu)
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ /*
+ * This gets called in the idle path where RCU
+ * functions differently. Tracing normally
+ * uses RCU, so we have to call the tracepoint
+ * specially here.
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
}
}
EXPORT_SYMBOL_GPL(leave_mm);
@@ -174,7 +180,7 @@ void flush_tlb_current_task(void)
*
* This is in units of pages.
*/
-unsigned long tlb_single_page_flush_ceiling = 33;
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5c8cb8043c5a..3f627345d51c 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -8,12 +8,10 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
-#include <linux/moduleloader.h>
-#include <asm/cacheflush.h>
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/if_vlan.h>
-#include <linux/random.h>
+#include <asm/cacheflush.h>
int bpf_jit_enable __read_mostly;
@@ -109,39 +107,6 @@ static inline void bpf_flush_icache(void *start, void *end)
#define CHOOSE_LOAD_FUNC(K, func) \
((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
-struct bpf_binary_header {
- unsigned int pages;
- /* Note : for security reasons, bpf code will follow a randomly
- * sized amount of int3 instructions
- */
- u8 image[];
-};
-
-static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
- u8 **image_ptr)
-{
- unsigned int sz, hole;
- struct bpf_binary_header *header;
-
- /* Most of BPF filters are really small,
- * but if some of them fill a page, allow at least
- * 128 extra bytes to insert a random section of int3
- */
- sz = round_up(proglen + sizeof(*header) + 128, PAGE_SIZE);
- header = module_alloc(sz);
- if (!header)
- return NULL;
-
- memset(header, 0xcc, sz); /* fill whole space with int3 instructions */
-
- header->pages = sz / PAGE_SIZE;
- hole = min(sz - (proglen + sizeof(*header)), PAGE_SIZE - sizeof(*header));
-
- /* insert a random number of int3 instructions before BPF code */
- *image_ptr = &header->image[prandom_u32() % hole];
- return header;
-}
-
/* pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_REG + 1)
@@ -206,17 +171,28 @@ static inline u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
+static void jit_fill_hole(void *area, unsigned int size)
+{
+ /*
+  * Fill the whole of @area (@size bytes) with int3 instructions
+  * (byte value 0xcc). Handed to bpf_jit_binary_alloc() as the
+  * callback that fills the allocated space.
+  */
+ memset(area, 0xcc, size);
+}
+
struct jit_context {
unsigned int cleanup_addr; /* epilogue code offset */
bool seen_ld_abs;
};
+/* maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE 128
+#define BPF_INSN_SAFETY 64
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
struct bpf_insn *insn = bpf_prog->insnsi;
int insn_cnt = bpf_prog->len;
- u8 temp[64];
+ bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
int i;
int proglen = 0;
u8 *prog = temp;
@@ -254,7 +230,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT2(0x31, 0xc0); /* xor eax, eax */
EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
- if (ctx->seen_ld_abs) {
+ if (seen_ld_abs) {
/* r9d : skb->len - skb->data_len (headlen)
* r10 : skb->data
*/
@@ -393,6 +369,23 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT1_off32(add_1reg(0xB8, dst_reg), imm32);
break;
+ case BPF_LD | BPF_IMM | BPF_DW:
+ if (insn[1].code != 0 || insn[1].src_reg != 0 ||
+ insn[1].dst_reg != 0 || insn[1].off != 0) {
+ /* verifier must catch invalid insns */
+ pr_err("invalid BPF_LD_IMM64 insn\n");
+ return -EINVAL;
+ }
+
+ /* movabsq %rax, imm64 */
+ EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
+ EMIT(insn[0].imm, 4);
+ EMIT(insn[1].imm, 4);
+
+ insn++;
+ i++;
+ break;
+
/* dst %= src, dst /= src, dst %= imm32, dst /= imm32 */
case BPF_ALU | BPF_MOD | BPF_X:
case BPF_ALU | BPF_DIV | BPF_X:
@@ -515,6 +508,48 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT3(0xC1, add_1reg(b3, dst_reg), imm32);
break;
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ case BPF_ALU64 | BPF_LSH | BPF_X:
+ case BPF_ALU64 | BPF_RSH | BPF_X:
+ case BPF_ALU64 | BPF_ARSH | BPF_X:
+
+ /* check for bad case when dst_reg == rcx */
+ if (dst_reg == BPF_REG_4) {
+ /* mov r11, dst_reg */
+ EMIT_mov(AUX_REG, dst_reg);
+ dst_reg = AUX_REG;
+ }
+
+ if (src_reg != BPF_REG_4) { /* common case */
+ EMIT1(0x51); /* push rcx */
+
+ /* mov rcx, src_reg */
+ EMIT_mov(BPF_REG_4, src_reg);
+ }
+
+ /* shl %rax, %cl | shr %rax, %cl | sar %rax, %cl */
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_LSH: b3 = 0xE0; break;
+ case BPF_RSH: b3 = 0xE8; break;
+ case BPF_ARSH: b3 = 0xF8; break;
+ }
+ EMIT2(0xD3, add_1reg(b3, dst_reg));
+
+ if (src_reg != BPF_REG_4)
+ EMIT1(0x59); /* pop rcx */
+
+ if (insn->dst_reg == BPF_REG_4)
+ /* mov dst_reg, r11 */
+ EMIT_mov(insn->dst_reg, AUX_REG);
+ break;
+
case BPF_ALU | BPF_END | BPF_FROM_BE:
switch (imm32) {
case 16:
@@ -655,7 +690,7 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_CALL:
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
- if (ctx->seen_ld_abs) {
+ if (seen_ld_abs) {
EMIT2(0x41, 0x52); /* push %r10 */
EMIT2(0x41, 0x51); /* push %r9 */
/* need to adjust jmp offset, since
@@ -669,7 +704,7 @@ xadd: if (is_imm8(insn->off))
return -EINVAL;
}
EMIT1_off32(0xE8, jmp_offset);
- if (ctx->seen_ld_abs) {
+ if (seen_ld_abs) {
EMIT2(0x41, 0x59); /* pop %r9 */
EMIT2(0x41, 0x5A); /* pop %r10 */
}
@@ -774,7 +809,8 @@ emit_jmp:
goto common_load;
case BPF_LD | BPF_ABS | BPF_W:
func = CHOOSE_LOAD_FUNC(imm32, sk_load_word);
-common_load: ctx->seen_ld_abs = true;
+common_load:
+ ctx->seen_ld_abs = seen_ld_abs = true;
jmp_offset = func - (image + addrs[i]);
if (!func || !is_simm32(jmp_offset)) {
pr_err("unsupported bpf func %d addr %p image %p\n",
@@ -848,6 +884,11 @@ common_load: ctx->seen_ld_abs = true;
}
ilen = prog - temp;
+ if (ilen > BPF_MAX_INSN_SIZE) {
+ pr_err("bpf_jit_compile fatal insn size error\n");
+ return -EFAULT;
+ }
+
if (image) {
if (unlikely(proglen + ilen > oldproglen)) {
pr_err("bpf_jit_compile fatal error\n");
@@ -900,17 +941,20 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
if (proglen <= 0) {
image = NULL;
if (header)
- module_free(NULL, header);
+ bpf_jit_binary_free(header);
goto out;
}
if (image) {
- if (proglen != oldproglen)
+ if (proglen != oldproglen) {
pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
proglen, oldproglen);
+ goto out;
+ }
break;
}
if (proglen == oldproglen) {
- header = bpf_alloc_binary(proglen, &image);
+ header = bpf_jit_binary_alloc(proglen, &image,
+ 1, jit_fill_hole);
if (!header)
goto out;
}
@@ -924,29 +968,23 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
bpf_flush_icache(header, image + proglen);
set_memory_ro((unsigned long)header, header->pages);
prog->bpf_func = (void *)image;
- prog->jited = 1;
+ prog->jited = true;
}
out:
kfree(addrs);
}
-static void bpf_jit_free_deferred(struct work_struct *work)
+void bpf_jit_free(struct bpf_prog *fp)
{
- struct bpf_prog *fp = container_of(work, struct bpf_prog, work);
unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
struct bpf_binary_header *header = (void *)addr;
+ if (!fp->jited)
+ goto free_filter;
+
set_memory_rw(addr, header->pages);
- module_free(NULL, header);
- kfree(fp);
-}
+ bpf_jit_binary_free(header);
-void bpf_jit_free(struct bpf_prog *fp)
-{
- if (fp->jited) {
- INIT_WORK(&fp->work, bpf_jit_free_deferred);
- schedule_work(&fp->work);
- } else {
- kfree(fp);
- }
+free_filter:
+ bpf_prog_unlock_free(fp);
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 379e8bd0deea..1d2e6392f5fa 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -64,11 +64,11 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
static int profile_exceptions_notify(unsigned int val, struct pt_regs *regs)
{
if (ctr_running)
- model->check_ctrs(regs, &__get_cpu_var(cpu_msrs));
+ model->check_ctrs(regs, this_cpu_ptr(&cpu_msrs));
else if (!nmi_enabled)
return NMI_DONE;
else
- model->stop(&__get_cpu_var(cpu_msrs));
+ model->stop(this_cpu_ptr(&cpu_msrs));
return NMI_HANDLED;
}
@@ -91,7 +91,7 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
static void nmi_cpu_start(void *dummy)
{
- struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
+ struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
if (!msrs->controls)
WARN_ON_ONCE(1);
else
@@ -111,7 +111,7 @@ static int nmi_start(void)
static void nmi_cpu_stop(void *dummy)
{
- struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
+ struct op_msrs const *msrs = this_cpu_ptr(&cpu_msrs);
if (!msrs->controls)
WARN_ON_ONCE(1);
else
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 98ab13058f89..ad1d91f475ab 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -372,7 +372,7 @@ static unsigned int get_stagger(void)
{
#ifdef CONFIG_SMP
int cpu = smp_processor_id();
- return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map));
+ return cpu != cpumask_first(this_cpu_cpumask_var_ptr(cpu_sibling_map));
#endif
return 0;
}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 5075371ab593..cfd1b132b8e3 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -448,7 +448,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
return;
size = sizeof(*info->res) * info->res_num;
- info->res = kzalloc(size, GFP_KERNEL);
+ info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
if (!info->res) {
info->res_num = 0;
return;
@@ -456,7 +456,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
size = sizeof(*info->res_offset) * info->res_num;
info->res_num = 0;
- info->res_offset = kzalloc(size, GFP_KERNEL);
+ info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
if (!info->res_offset) {
kfree(info->res);
info->res = NULL;
@@ -499,7 +499,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
if (node != NUMA_NO_NODE && !node_online(node))
node = NUMA_NO_NODE;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kzalloc_node(sizeof(*info), GFP_KERNEL, node);
if (!info) {
printk(KERN_WARNING "pci_bus %04x:%02x: "
"ignored (out of memory)\n", domain, busnum);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 059a76c29739..7b20bccf3648 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -81,14 +81,14 @@ struct pci_ops pci_root_ops = {
*/
DEFINE_RAW_SPINLOCK(pci_config_lock);
-static int can_skip_ioresource_align(const struct dmi_system_id *d)
+static int __init can_skip_ioresource_align(const struct dmi_system_id *d)
{
pci_probe |= PCI_CAN_SKIP_ISA_ALIGN;
printk(KERN_INFO "PCI: %s detected, can skip ISA alignment\n", d->ident);
return 0;
}
-static const struct dmi_system_id can_skip_pciprobe_dmi_table[] = {
+static const struct dmi_system_id can_skip_pciprobe_dmi_table[] __initconst = {
/*
* Systems where PCI IO resource ISA alignment can be skipped
* when the ISA enable bit in the bridge control is not set
@@ -186,7 +186,7 @@ void pcibios_remove_bus(struct pci_bus *bus)
* on the kernel command line (which was parsed earlier).
*/
-static int set_bf_sort(const struct dmi_system_id *d)
+static int __init set_bf_sort(const struct dmi_system_id *d)
{
if (pci_bf_sort == pci_bf_sort_default) {
pci_bf_sort = pci_dmi_bf;
@@ -195,8 +195,8 @@ static int set_bf_sort(const struct dmi_system_id *d)
return 0;
}
-static void read_dmi_type_b1(const struct dmi_header *dm,
- void *private_data)
+static void __init read_dmi_type_b1(const struct dmi_header *dm,
+ void *private_data)
{
u8 *d = (u8 *)dm + 4;
@@ -217,7 +217,7 @@ static void read_dmi_type_b1(const struct dmi_header *dm,
}
}
-static int find_sort_method(const struct dmi_system_id *d)
+static int __init find_sort_method(const struct dmi_system_id *d)
{
dmi_walk(read_dmi_type_b1, NULL);
@@ -232,7 +232,7 @@ static int find_sort_method(const struct dmi_system_id *d)
* Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
*/
#ifdef __i386__
-static int assign_all_busses(const struct dmi_system_id *d)
+static int __init assign_all_busses(const struct dmi_system_id *d)
{
pci_probe |= PCI_ASSIGN_ALL_BUSSES;
printk(KERN_INFO "%s detected: enabling PCI bus# renumbering"
@@ -241,7 +241,7 @@ static int assign_all_busses(const struct dmi_system_id *d)
}
#endif
-static int set_scan_all(const struct dmi_system_id *d)
+static int __init set_scan_all(const struct dmi_system_id *d)
{
printk(KERN_INFO "PCI: %s detected, enabling pci=pcie_scan_all\n",
d->ident);
@@ -249,7 +249,7 @@ static int set_scan_all(const struct dmi_system_id *d)
return 0;
}
-static const struct dmi_system_id pciprobe_dmi_table[] = {
+static const struct dmi_system_id pciprobe_dmi_table[] __initconst = {
#ifdef __i386__
/*
* Laptops which need pci=assign-busses to see Cardbus cards
@@ -512,7 +512,7 @@ int __init pcibios_init(void)
return 0;
}
-char * __init pcibios_setup(char *str)
+char *__init pcibios_setup(char *str)
{
if (!strcmp(str, "off")) {
pci_probe = 0;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index c61ea57d1ba1..9a2b7101ae8a 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -326,27 +326,6 @@ static void pci_fixup_video(struct pci_dev *pdev)
struct pci_bus *bus;
u16 config;
- if (!vga_default_device()) {
- resource_size_t start, end;
- int i;
-
- /* Does firmware framebuffer belong to us? */
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
- if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
- continue;
-
- start = pci_resource_start(pdev, i);
- end = pci_resource_end(pdev, i);
-
- if (!start || !end)
- continue;
-
- if (screen_info.lfb_base >= start &&
- (screen_info.lfb_base + screen_info.lfb_size) < end)
- vga_set_default_device(pdev);
- }
- }
-
/* Is VGA routed to us? */
bus = pdev->bus;
while (bus) {
@@ -371,8 +350,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
pci_read_config_word(pdev, PCI_COMMAND, &config);
if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
- dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n");
- vga_set_default_device(pdev);
+ dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n");
}
}
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2ae525e0d8ba..37c1435889ce 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
*/
prot |= _PAGE_CACHE_UC_MINUS;
- prot |= _PAGE_IOMAP; /* creating a mapping for IO */
-
vma->vm_page_prot = __pgprot(prot);
if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 84b9d672843d..b9958c364075 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -208,27 +208,31 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
- u8 pin;
- struct io_apic_irq_attr irq_attr;
+ int polarity;
- pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+ if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
+ polarity = 0; /* active high */
+ else
+ polarity = 1; /* active low */
/*
* MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
* IOAPIC RTE entries, so we just enable RTE for the device.
*/
- irq_attr.ioapic = mp_find_ioapic(dev->irq);
- irq_attr.ioapic_pin = dev->irq;
- irq_attr.trigger = 1; /* level */
- if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
- irq_attr.polarity = 0; /* active high */
- else
- irq_attr.polarity = 1; /* active low */
- io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
+ if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
+ return -EBUSY;
+ if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+ return -EBUSY;
return 0;
}
+/*
+ * Disable hook for Intel MID: unmap the device's IRQ unless
+ * mp_should_keep_irq() says it must stay, or no IRQ is assigned.
+ * dev->irq itself is left unchanged.
+ * NOTE(review): presumably because intel_mid_pci_irq_enable() reads
+ * dev->irq back as the GSI -- confirm.
+ */
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+	if (mp_should_keep_irq(&dev->dev))
+		return;
+
+	if (dev->irq <= 0)
+		return;
+
+	mp_unmap_irq(dev->irq);
+}
+
struct pci_ops intel_mid_pci_ops = {
.read = pci_read,
.write = pci_write,
@@ -245,6 +249,7 @@ int __init intel_mid_pci_init(void)
pr_info("Intel MID platform detected, using MID PCI ops\n");
pci_mmcfg_late_init();
pcibios_enable_irq = intel_mid_pci_irq_enable;
+ pcibios_disable_irq = intel_mid_pci_irq_disable;
pci_root_ops = intel_mid_pci_ops;
pci_soc_mode = 1;
/* Continue with standard init */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 84112f55dd7a..eb500c2592ad 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -26,6 +26,7 @@ static int acer_tm360_irqrouting;
static struct irq_routing_table *pirq_table;
static int pirq_enable_irq(struct pci_dev *dev);
+static void pirq_disable_irq(struct pci_dev *dev);
/*
* Never use: 0, 1, 2 (timer, keyboard, and cascade)
@@ -53,7 +54,7 @@ struct irq_router_handler {
};
int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
-void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = pirq_disable_irq;
/*
* Check passed address for the PCI IRQ Routing Table signature
@@ -1186,7 +1187,7 @@ void pcibios_penalize_isa_irq(int irq, int active)
static int pirq_enable_irq(struct pci_dev *dev)
{
- u8 pin;
+ u8 pin = 0;
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (pin && !pcibios_lookup_irq(dev, 1)) {
@@ -1227,8 +1228,6 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
- io_apic_set_pci_routing(&dev->dev, irq,
- &irq_attr);
dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1254,3 +1253,12 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
return 0;
}
+
+/*
+ * Default pcibios_disable_irq hook: when IO-APIC PCI IRQ assignment is
+ * in use and the IRQ need not be kept, unmap the device's IRQ and
+ * clear dev->irq.
+ */
+static void pirq_disable_irq(struct pci_dev *dev)
+{
+	if (!io_apic_assign_pci_irqs)
+		return;
+
+	if (mp_should_keep_irq(&dev->dev))
+		return;
+
+	if (!dev->irq)
+		return;
+
+	mp_unmap_irq(dev->irq);
+	dev->irq = 0;
+}
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 248642f4bab7..326198a4434e 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -31,7 +31,7 @@ static DEFINE_MUTEX(pci_mmcfg_lock);
LIST_HEAD(pci_mmcfg_list);
-static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
+static void __init pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
{
if (cfg->res.parent)
release_resource(&cfg->res);
@@ -39,7 +39,7 @@ static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
kfree(cfg);
}
-static __init void free_all_mmcfg(void)
+static void __init free_all_mmcfg(void)
{
struct pci_mmcfg_region *cfg, *tmp;
@@ -93,7 +93,7 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
return new;
}
-static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
+static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
int end, u64 addr)
{
struct pci_mmcfg_region *new;
@@ -125,7 +125,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
return NULL;
}
-static const char __init *pci_mmcfg_e7520(void)
+static const char *__init pci_mmcfg_e7520(void)
{
u32 win;
raw_pci_ops->read(0, 0, PCI_DEVFN(0, 0), 0xce, 2, &win);
@@ -140,7 +140,7 @@ static const char __init *pci_mmcfg_e7520(void)
return "Intel Corporation E7520 Memory Controller Hub";
}
-static const char __init *pci_mmcfg_intel_945(void)
+static const char *__init pci_mmcfg_intel_945(void)
{
u32 pciexbar, mask = 0, len = 0;
@@ -184,7 +184,7 @@ static const char __init *pci_mmcfg_intel_945(void)
return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
}
-static const char __init *pci_mmcfg_amd_fam10h(void)
+static const char *__init pci_mmcfg_amd_fam10h(void)
{
u32 low, high, address;
u64 base, msr;
@@ -235,21 +235,25 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
}
static bool __initdata mcp55_checked;
-static const char __init *pci_mmcfg_nvidia_mcp55(void)
+static const char *__init pci_mmcfg_nvidia_mcp55(void)
{
int bus;
int mcp55_mmconf_found = 0;
- static const u32 extcfg_regnum = 0x90;
- static const u32 extcfg_regsize = 4;
- static const u32 extcfg_enable_mask = 1<<31;
- static const u32 extcfg_start_mask = 0xff<<16;
- static const int extcfg_start_shift = 16;
- static const u32 extcfg_size_mask = 0x3<<28;
- static const int extcfg_size_shift = 28;
- static const int extcfg_sizebus[] = {0x100, 0x80, 0x40, 0x20};
- static const u32 extcfg_base_mask[] = {0x7ff8, 0x7ffc, 0x7ffe, 0x7fff};
- static const int extcfg_base_lshift = 25;
+ static const u32 extcfg_regnum __initconst = 0x90;
+ static const u32 extcfg_regsize __initconst = 4;
+ static const u32 extcfg_enable_mask __initconst = 1 << 31;
+ static const u32 extcfg_start_mask __initconst = 0xff << 16;
+ static const int extcfg_start_shift __initconst = 16;
+ static const u32 extcfg_size_mask __initconst = 0x3 << 28;
+ static const int extcfg_size_shift __initconst = 28;
+ static const int extcfg_sizebus[] __initconst = {
+ 0x100, 0x80, 0x40, 0x20
+ };
+ static const u32 extcfg_base_mask[] __initconst = {
+ 0x7ff8, 0x7ffc, 0x7ffe, 0x7fff
+ };
+ static const int extcfg_base_lshift __initconst = 25;
/*
* do check if amd fam10h already took over
@@ -302,7 +306,7 @@ struct pci_mmcfg_hostbridge_probe {
const char *(*probe)(void);
};
-static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
+static const struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initconst = {
{ 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL,
PCI_DEVICE_ID_INTEL_E7520_MCH, pci_mmcfg_e7520 },
{ 0, PCI_DEVFN(0, 0), PCI_VENDOR_ID_INTEL,
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index c77b24a8b2da..9b83b9051ae7 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -79,13 +79,13 @@ union bios32 {
static struct {
unsigned long address;
unsigned short segment;
-} bios32_indirect = { 0, __KERNEL_CS };
+} bios32_indirect __initdata = { 0, __KERNEL_CS };
/*
* Returns the entry point for the given service, NULL on error
*/
-static unsigned long bios32_service(unsigned long service)
+static unsigned long __init bios32_service(unsigned long service)
{
unsigned char return_code; /* %al */
unsigned long address; /* %ebx */
@@ -124,7 +124,7 @@ static struct {
static int pci_bios_present;
-static int check_pcibios(void)
+static int __init check_pcibios(void)
{
u32 signature, eax, ebx, ecx;
u8 status, major_ver, minor_ver, hw_mech;
@@ -312,7 +312,7 @@ static const struct pci_raw_ops pci_bios_access = {
* Try to find PCI BIOS.
*/
-static const struct pci_raw_ops *pci_find_bios(void)
+static const struct pci_raw_ops *__init pci_find_bios(void)
{
union bios32 *check;
unsigned char sum;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 905956f16465..093f5f4272d3 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -23,6 +23,7 @@
#include <xen/features.h>
#include <xen/events.h>
#include <asm/xen/pci.h>
+#include <asm/i8259.h>
static int xen_pcifront_enable_irq(struct pci_dev *dev)
{
@@ -40,7 +41,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev)
/* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
pirq = gsi;
- if (gsi < NR_IRQS_LEGACY)
+ if (gsi < nr_legacy_irqs())
share = 0;
rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
@@ -511,7 +512,7 @@ int __init pci_xen_initial_domain(void)
xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
/* Pre-allocate legacy irqs */
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
+ for (irq = 0; irq < nr_legacy_irqs(); irq++) {
int trigger, polarity;
if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
@@ -522,7 +523,7 @@ int __init pci_xen_initial_domain(void)
true /* Map GSI to PIRQ */);
}
if (0 == nr_ioapics) {
- for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
}
return 0;
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index 8244f5ec2f4c..701fd5843c87 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -135,14 +135,10 @@ static void __init sdv_arch_setup(void)
sdv_serial_fixup();
}
-#ifdef CONFIG_X86_IO_APIC
static void sdv_pci_init(void)
{
x86_of_pci_init();
- /* We can't set this earlier, because we need to calibrate the timer */
- legacy_pic = &null_legacy_pic;
}
-#endif
/*
* CE4100 specific x86_init function overrides and early setup
@@ -155,7 +151,9 @@ void __init x86_ce4100_early_setup(void)
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
x86_init.pci.init = ce4100_pci_init;
+ x86_init.pci.init_irq = sdv_pci_init;
/*
* By default, the reboot method is ACPI which is supported by the
@@ -166,10 +164,5 @@ void __init x86_ce4100_early_setup(void)
*/
reboot_type = BOOT_KBD;
-#ifdef CONFIG_X86_IO_APIC
- x86_init.pci.init_irq = sdv_pci_init;
- x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
-#endif
-
pm_power_off = ce4100_power_off;
}
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index f15103dff4b4..d143d216d52b 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -40,20 +40,40 @@ void __init efi_bgrt_init(void)
if (ACPI_FAILURE(status))
return;
- if (bgrt_tab->header.length < sizeof(*bgrt_tab))
+ if (bgrt_tab->header.length < sizeof(*bgrt_tab)) {
+ pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n",
+ bgrt_tab->header.length, sizeof(*bgrt_tab));
return;
- if (bgrt_tab->version != 1 || bgrt_tab->status != 1)
+ }
+ if (bgrt_tab->version != 1) {
+ pr_err("Ignoring BGRT: invalid version %u (expected 1)\n",
+ bgrt_tab->version);
+ return;
+ }
+ if (bgrt_tab->status != 1) {
+ pr_err("Ignoring BGRT: invalid status %u (expected 1)\n",
+ bgrt_tab->status);
+ return;
+ }
+ if (bgrt_tab->image_type != 0) {
+ pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n",
+ bgrt_tab->image_type);
return;
- if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address)
+ }
+ if (!bgrt_tab->image_address) {
+ pr_err("Ignoring BGRT: null image address\n");
return;
+ }
image = efi_lookup_mapped_addr(bgrt_tab->image_address);
if (!image) {
image = early_memremap(bgrt_tab->image_address,
sizeof(bmp_header));
ioremapped = true;
- if (!image)
+ if (!image) {
+ pr_err("Ignoring BGRT: failed to map image header memory\n");
return;
+ }
}
memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
@@ -61,14 +81,18 @@ void __init efi_bgrt_init(void)
early_iounmap(image, sizeof(bmp_header));
bgrt_image_size = bmp_header.size;
- bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL);
- if (!bgrt_image)
+ bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!bgrt_image) {
+ pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n",
+ bgrt_image_size);
return;
+ }
if (ioremapped) {
image = early_memremap(bgrt_tab->image_address,
bmp_header.size);
if (!image) {
+ pr_err("Ignoring BGRT: failed to map image memory\n");
kfree(bgrt_image);
bgrt_image = NULL;
return;
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 850da94fef30..dbc8627a5cdf 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -70,17 +70,7 @@ static efi_config_table_type_t arch_tables[] __initdata = {
u64 efi_setup; /* efi setup_data physical address */
-static bool disable_runtime __initdata = false;
-static int __init setup_noefi(char *arg)
-{
- disable_runtime = true;
- return 0;
-}
-early_param("noefi", setup_noefi);
-
-int add_efi_memmap;
-EXPORT_SYMBOL(add_efi_memmap);
-
+static int add_efi_memmap __initdata;
static int __init setup_add_efi_memmap(char *arg)
{
add_efi_memmap = 1;
@@ -96,7 +86,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
{
efi_status_t status;
- efi_call_phys_prelog();
+ efi_call_phys_prolog();
status = efi_call_phys(efi_phys.set_virtual_address_map,
memory_map_size, descriptor_size,
descriptor_version, virtual_map);
@@ -210,9 +200,12 @@ static void __init print_efi_memmap(void)
for (p = memmap.map, i = 0;
p < memmap.map_end;
p += memmap.desc_size, i++) {
+ char buf[64];
+
md = p;
- pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n",
- i, md->type, md->attribute, md->phys_addr,
+ pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n",
+ i, efi_md_typeattr_format(buf, sizeof(buf), md),
+ md->phys_addr,
md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
}
@@ -344,9 +337,9 @@ static int __init efi_runtime_init32(void)
}
/*
- * We will only need *early* access to the following two
- * EFI runtime services before set_virtual_address_map
- * is invoked.
+ * We will only need *early* access to the SetVirtualAddressMap
+ * EFI runtime service. All other runtime services will be called
+ * via the virtual mapping.
*/
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
@@ -368,9 +361,9 @@ static int __init efi_runtime_init64(void)
}
/*
- * We will only need *early* access to the following two
- * EFI runtime services before set_virtual_address_map
- * is invoked.
+ * We will only need *early* access to the SetVirtualAddressMap
+ * EFI runtime service. All other runtime services will be called
+ * via the virtual mapping.
*/
efi_phys.set_virtual_address_map =
(efi_set_virtual_address_map_t *)
@@ -492,7 +485,7 @@ void __init efi_init(void)
if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
else {
- if (disable_runtime || efi_runtime_init())
+ if (efi_runtime_disabled() || efi_runtime_init())
return;
}
if (efi_memmap_init())
@@ -537,7 +530,7 @@ void __init runtime_code_page_mkexec(void)
}
}
-void efi_memory_uc(u64 addr, unsigned long size)
+void __init efi_memory_uc(u64 addr, unsigned long size)
{
unsigned long page_shift = 1UL << EFI_PAGE_SHIFT;
u64 npages;
@@ -732,6 +725,7 @@ static void __init kexec_enter_virtual_mode(void)
*/
if (!efi_is_native()) {
efi_unmap_memmap();
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -805,6 +799,7 @@ static void __init __efi_enter_virtual_mode(void)
new_memmap = efi_map_regions(&count, &pg_shift);
if (!new_memmap) {
pr_err("Error reallocating memory, EFI runtime non-functional!\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
}
@@ -812,8 +807,10 @@ static void __init __efi_enter_virtual_mode(void)
BUG_ON(!efi.systab);
- if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift))
+ if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) {
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
+ }
efi_sync_low_kernel_mappings();
efi_dump_pagetable();
@@ -938,14 +935,11 @@ u64 efi_mem_attributes(unsigned long phys_addr)
return 0;
}
-static int __init parse_efi_cmdline(char *str)
+static int __init arch_parse_efi_cmdline(char *str)
{
- if (*str == '=')
- str++;
-
- if (!strncmp(str, "old_map", 7))
+ if (parse_option_str(str, "old_map"))
set_bit(EFI_OLD_MEMMAP, &efi.flags);
return 0;
}
-early_param("efi", parse_efi_cmdline);
+early_param("efi", arch_parse_efi_cmdline);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 9ee3491e31fb..40e7cda52936 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -33,7 +33,7 @@
/*
* To make EFI call EFI runtime service in physical addressing mode we need
- * prelog/epilog before/after the invocation to disable interrupt, to
+ * prolog/epilog before/after the invocation to disable interrupt, to
* claim EFI runtime service handler exclusively and to duplicate a memory in
* low memory space say 0 - 3G.
*/
@@ -41,11 +41,13 @@ static unsigned long efi_rt_eflags;
void efi_sync_low_kernel_mappings(void) {}
void __init efi_dump_pagetable(void) {}
-int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
return 0;
}
-void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {}
+void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+{
+}
void __init efi_map_region(efi_memory_desc_t *md)
{
@@ -55,7 +57,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
-void efi_call_phys_prelog(void)
+void __init efi_call_phys_prolog(void)
{
struct desc_ptr gdt_descr;
@@ -69,7 +71,7 @@ void efi_call_phys_prelog(void)
load_gdt(&gdt_descr);
}
-void efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(void)
{
struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 290d397e1dd9..35aecb6042fb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -79,7 +79,7 @@ static void __init early_code_mapping_set_exec(int executable)
}
}
-void __init efi_call_phys_prelog(void)
+void __init efi_call_phys_prolog(void)
{
unsigned long vaddress;
int pgd;
@@ -139,7 +139,7 @@ void efi_sync_low_kernel_mappings(void)
sizeof(pgd_t) * num_pgds);
}
-int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
unsigned long text;
struct page *page;
@@ -192,7 +192,7 @@ int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 0;
}
-void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
+void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index fbe66e626c09..040192b50d02 100644
--- a/arch/x86/platform/efi/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -27,13 +27,13 @@ ENTRY(efi_call_phys)
* set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
* the values of these registers are the same. And, the corresponding
* GDT entries are identical. So I will do nothing about segment reg
- * and GDT, but change GDT base register in prelog and epilog.
+ * and GDT, but change GDT base register in prolog and epilog.
*/
/*
* 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
* But to make it smoothly switch from virtual mode to flat mode.
- * The mapping of lower virtual memory has been created in prelog and
+ * The mapping of lower virtual memory has been created in prolog and
* epilog.
*/
movl $1f, %edx
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
index 973cf3bfa9fd..0b283d4d0ad7 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -26,28 +26,18 @@ static struct platform_device wdt_dev = {
static int tangier_probe(struct platform_device *pdev)
{
- int ioapic;
- int irq;
+ int gsi;
struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
- struct io_apic_irq_attr irq_attr = { 0 };
if (!pdata)
return -EINVAL;
- irq = pdata->irq;
- ioapic = mp_find_ioapic(irq);
- if (ioapic >= 0) {
- int ret;
- irq_attr.ioapic = ioapic;
- irq_attr.ioapic_pin = irq;
- irq_attr.trigger = 1;
- /* irq_attr.polarity = 0; -> Active high */
- ret = io_apic_set_pci_routing(NULL, irq, &irq_attr);
- if (ret)
- return ret;
- } else {
+ /* IOAPIC builds identity mapping between GSI and IRQ on MID */
+ gsi = pdata->irq;
+ if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
+ mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
- irq);
+ gsi);
return -EINVAL;
}
diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
index 46aa25c8ce06..3c1c3866d82b 100644
--- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
+++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h
@@ -10,10 +10,9 @@
*/
-/* __attribute__((weak)) makes these declarations overridable */
/* For every CPU addition a new get_<cpuname>_ops interface needs
* to be added.
*/
-extern void *get_penwell_ops(void) __attribute__((weak));
-extern void *get_cloverview_ops(void) __attribute__((weak));
-extern void *get_tangier_ops(void) __attribute__((weak));
+extern void *get_penwell_ops(void);
+extern void *get_cloverview_ops(void);
+extern void *get_tangier_ops(void);
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index 994c40bd7cb7..c14ad34776c4 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -106,6 +106,7 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table)
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
mp_save_irq(&mp_irq);
+ mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
}
return 0;
@@ -176,6 +177,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
mp_save_irq(&mp_irq);
+ mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
}
return 0;
}
@@ -432,9 +434,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
struct sfi_table_simple *sb;
struct sfi_device_table_entry *pentry;
struct devs_id *dev = NULL;
- int num, i;
- int ioapic;
- struct io_apic_irq_attr irq_attr;
+ int num, i, ret;
+ int polarity;
sb = (struct sfi_table_simple *)table;
num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -448,35 +449,30 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
* devices, but they have separate RTE entry in IOAPIC
* so we have to enable them one by one here
*/
- ioapic = mp_find_ioapic(irq);
- if (ioapic >= 0) {
- irq_attr.ioapic = ioapic;
- irq_attr.ioapic_pin = irq;
- irq_attr.trigger = 1;
- if (intel_mid_identify_cpu() ==
- INTEL_MID_CPU_CHIP_TANGIER) {
- if (!strncmp(pentry->name,
- "r69001-ts-i2c", 13))
- /* active low */
- irq_attr.polarity = 1;
- else if (!strncmp(pentry->name,
- "synaptics_3202", 14))
- /* active low */
- irq_attr.polarity = 1;
- else if (irq == 41)
- /* fast_int_1 */
- irq_attr.polarity = 1;
- else
- /* active high */
- irq_attr.polarity = 0;
- } else {
- /* PNW and CLV go with active low */
- irq_attr.polarity = 1;
- }
- io_apic_set_pci_routing(NULL, irq, &irq_attr);
+ if (intel_mid_identify_cpu() ==
+ INTEL_MID_CPU_CHIP_TANGIER) {
+ if (!strncmp(pentry->name, "r69001-ts-i2c", 13))
+ /* active low */
+ polarity = 1;
+ else if (!strncmp(pentry->name,
+ "synaptics_3202", 14))
+ /* active low */
+ polarity = 1;
+ else if (irq == 41)
+ /* fast_int_1 */
+ polarity = 1;
+ else
+ /* active high */
+ polarity = 0;
+ } else {
+ /* PNW and CLV go with active low */
+ polarity = 1;
}
- } else {
- irq = 0; /* No irq */
+
+ ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
+ if (ret == 0)
+ ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+ WARN_ON(ret < 0);
}
dev = get_device_id(pentry->type, pentry->name);
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index bcd1a703e3e6..2a8a74f3bd76 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -25,6 +25,7 @@
#include <linux/init.h>
#include <linux/sfi.h>
#include <linux/io.h>
+#include <linux/irqdomain.h>
#include <asm/io_apic.h>
#include <asm/mpspec.h>
@@ -70,19 +71,26 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
+static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
+ .map = mp_irqdomain_map,
+};
static int __init sfi_parse_ioapic(struct sfi_table_header *table)
{
struct sfi_table_simple *sb;
struct sfi_apic_table_entry *pentry;
int i, num;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_STRICT,
+ .ops = &sfi_ioapic_irqdomain_ops,
+ };
sb = (struct sfi_table_simple *)table;
num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
pentry = (struct sfi_apic_table_entry *)sb->pentry;
for (i = 0; i < num; i++) {
- mp_register_ioapic(i, pentry->phys_addr, gsi_top);
+ mp_register_ioapic(i, pentry->phys_addr, gsi_top, &cfg);
pentry++;
}
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index ed161c6e278b..3968d67d366b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1479,7 +1479,7 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user,
return count;
}
- if (strict_strtol(optstr, 10, &input_arg) < 0) {
+ if (kstrtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index c89c93320c12..c6b146e67116 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -63,8 +63,8 @@
static struct uv_hub_nmi_s **uv_hub_nmi_list;
-DEFINE_PER_CPU(struct uv_cpu_nmi_s, __uv_cpu_nmi);
-EXPORT_PER_CPU_SYMBOL_GPL(__uv_cpu_nmi);
+DEFINE_PER_CPU(struct uv_cpu_nmi_s, uv_cpu_nmi);
+EXPORT_PER_CPU_SYMBOL_GPL(uv_cpu_nmi);
static unsigned long nmi_mmr;
static unsigned long nmi_mmr_clear;
@@ -215,7 +215,7 @@ static int uv_check_nmi(struct uv_hub_nmi_s *hub_nmi)
int nmi = 0;
local64_inc(&uv_nmi_count);
- uv_cpu_nmi.queries++;
+ this_cpu_inc(uv_cpu_nmi.queries);
do {
nmi = atomic_read(&hub_nmi->in_nmi);
@@ -293,7 +293,7 @@ static void uv_nmi_nr_cpus_ping(void)
int cpu;
for_each_cpu(cpu, uv_nmi_cpu_mask)
- atomic_set(&uv_cpu_nmi_per(cpu).pinging, 1);
+ uv_cpu_nmi_per(cpu).pinging = 1;
apic->send_IPI_mask(uv_nmi_cpu_mask, APIC_DM_NMI);
}
@@ -304,8 +304,8 @@ static void uv_nmi_cleanup_mask(void)
int cpu;
for_each_cpu(cpu, uv_nmi_cpu_mask) {
- atomic_set(&uv_cpu_nmi_per(cpu).pinging, 0);
- atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_OUT);
+ uv_cpu_nmi_per(cpu).pinging = 0;
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_OUT;
cpumask_clear_cpu(cpu, uv_nmi_cpu_mask);
}
}
@@ -328,7 +328,7 @@ static int uv_nmi_wait_cpus(int first)
int loop_delay = uv_nmi_loop_delay;
for_each_cpu(j, uv_nmi_cpu_mask) {
- if (atomic_read(&uv_cpu_nmi_per(j).state)) {
+ if (uv_cpu_nmi_per(j).state) {
cpumask_clear_cpu(j, uv_nmi_cpu_mask);
if (++k >= n)
break;
@@ -359,7 +359,7 @@ static int uv_nmi_wait_cpus(int first)
static void uv_nmi_wait(int master)
{
/* indicate this cpu is in */
- atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_IN);
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_IN);
/* if not the first cpu in (the master), then we are a slave cpu */
if (!master)
@@ -419,7 +419,7 @@ static void uv_nmi_dump_state_cpu(int cpu, struct pt_regs *regs)
"UV:%sNMI process trace for CPU %d\n", dots, cpu);
show_regs(regs);
}
- atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_DUMP_DONE);
}
/* Trigger a slave cpu to dump it's state */
@@ -427,20 +427,20 @@ static void uv_nmi_trigger_dump(int cpu)
{
int retry = uv_nmi_trigger_delay;
- if (atomic_read(&uv_cpu_nmi_per(cpu).state) != UV_NMI_STATE_IN)
+ if (uv_cpu_nmi_per(cpu).state != UV_NMI_STATE_IN)
return;
- atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP);
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP;
do {
cpu_relax();
udelay(10);
- if (atomic_read(&uv_cpu_nmi_per(cpu).state)
+ if (uv_cpu_nmi_per(cpu).state
!= UV_NMI_STATE_DUMP)
return;
} while (--retry > 0);
pr_crit("UV: CPU %d stuck in process dump function\n", cpu);
- atomic_set(&uv_cpu_nmi_per(cpu).state, UV_NMI_STATE_DUMP_DONE);
+ uv_cpu_nmi_per(cpu).state = UV_NMI_STATE_DUMP_DONE;
}
/* Wait until all cpus ready to exit */
@@ -488,7 +488,7 @@ static void uv_nmi_dump_state(int cpu, struct pt_regs *regs, int master)
} else {
while (!atomic_read(&uv_nmi_slave_continue))
cpu_relax();
- while (atomic_read(&uv_cpu_nmi.state) != UV_NMI_STATE_DUMP)
+ while (this_cpu_read(uv_cpu_nmi.state) != UV_NMI_STATE_DUMP)
cpu_relax();
uv_nmi_dump_state_cpu(cpu, regs);
}
@@ -615,7 +615,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
local_irq_save(flags);
/* If not a UV System NMI, ignore */
- if (!atomic_read(&uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) {
+ if (!this_cpu_read(uv_cpu_nmi.pinging) && !uv_check_nmi(hub_nmi)) {
local_irq_restore(flags);
return NMI_DONE;
}
@@ -639,7 +639,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
uv_call_kgdb_kdb(cpu, regs, master);
/* Clear per_cpu "in nmi" flag */
- atomic_set(&uv_cpu_nmi.state, UV_NMI_STATE_OUT);
+ this_cpu_write(uv_cpu_nmi.state, UV_NMI_STATE_OUT);
/* Clear MMR NMI flag on each hub */
uv_clear_nmi(cpu);
@@ -666,16 +666,16 @@ static int uv_handle_nmi_ping(unsigned int reason, struct pt_regs *regs)
{
int ret;
- uv_cpu_nmi.queries++;
- if (!atomic_read(&uv_cpu_nmi.pinging)) {
+ this_cpu_inc(uv_cpu_nmi.queries);
+ if (!this_cpu_read(uv_cpu_nmi.pinging)) {
local64_inc(&uv_nmi_ping_misses);
return NMI_DONE;
}
- uv_cpu_nmi.pings++;
+ this_cpu_inc(uv_cpu_nmi.pings);
local64_inc(&uv_nmi_ping_count);
ret = uv_handle_nmi(reason, regs);
- atomic_set(&uv_cpu_nmi.pinging, 0);
+ this_cpu_write(uv_cpu_nmi.pinging, 0);
return ret;
}
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 5c86786bbfd2..a244237f3cfa 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -365,7 +365,7 @@ __setup("uvrtcevt", uv_enable_evt_rtc);
static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
{
- struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+ struct clock_event_device *ced = this_cpu_ptr(&cpu_ced);
*ced = clock_event_device_uv;
ced->cpumask = cpumask_of(smp_processor_id());
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 7d28c885d238..291226b952a9 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -13,13 +13,11 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mmzone.h>
+#include <asm/sections.h>
/* Defined in hibernate_asm_32.S */
extern int restore_image(void);
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
-
/* Pointer to the temporary resume page tables */
pgd_t *resume_pg_dir;
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 35e2bb6c0f37..009947d419a6 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -17,11 +17,9 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/mtrr.h>
+#include <asm/sections.h>
#include <asm/suspend.h>
-/* References to section boundaries */
-extern __visible const void __nosave_begin, __nosave_end;
-
/* Defined in hibernate_asm_64.S */
extern asmlinkage __visible int restore_image(void);
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
new file mode 100644
index 000000000000..f52e033557c9
--- /dev/null
+++ b/arch/x86/purgatory/Makefile
@@ -0,0 +1,29 @@
+purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
+
+targets += $(purgatory-y)
+PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
+targets += purgatory.ro
+
+# Default KBUILD_CFLAGS can have the -pg option set when FTRACE is enabled. That
+# in turn leaves undefined symbols such as __fentry__ in purgatory, and it is
+# unclear how to relocate those. Like kexec-tools, use custom flags instead.
+
+KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large
+KBUILD_CFLAGS += -m$(BITS)
+
+$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
+ $(call if_changed,ld)
+
+targets += kexec-purgatory.c
+
+CMD_BIN2C = $(objtree)/scripts/basic/bin2c
+quiet_cmd_bin2c = BIN2C $@
+ cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+ $(call if_changed,bin2c)
+
+
+obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o
diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S
new file mode 100644
index 000000000000..d1a4291d3568
--- /dev/null
+++ b/arch/x86/purgatory/entry64.S
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+
+ * Author(s): Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .balign 16
+ .code64
+ .globl entry64, entry64_regs
+
+
+entry64:
+ /* Setup a gdt that should be preserved */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup new stack */
+ leaq stack_init(%rip), %rsp
+ pushq $0x10 /* CS */
+ leaq new_cs_exit(%rip), %rax
+ pushq %rax
+ lretq
+new_cs_exit:
+
+ /* Load the registers */
+ movq rax(%rip), %rax
+ movq rbx(%rip), %rbx
+ movq rcx(%rip), %rcx
+ movq rdx(%rip), %rdx
+ movq rsi(%rip), %rsi
+ movq rdi(%rip), %rdi
+ movq rsp(%rip), %rsp
+ movq rbp(%rip), %rbp
+ movq r8(%rip), %r8
+ movq r9(%rip), %r9
+ movq r10(%rip), %r10
+ movq r11(%rip), %r11
+ movq r12(%rip), %r12
+ movq r13(%rip), %r13
+ movq r14(%rip), %r14
+ movq r15(%rip), %r15
+
+ /* Jump to the new code... */
+ jmpq *rip(%rip)
+
+ .section ".rodata"
+ .balign 4
+entry64_regs:
+rax: .quad 0x0
+rcx: .quad 0x0
+rdx: .quad 0x0
+rbx: .quad 0x0
+rsp: .quad 0x0
+rbp: .quad 0x0
+rsi: .quad 0x0
+rdi: .quad 0x0
+r8: .quad 0x0
+r9: .quad 0x0
+r10: .quad 0x0
+r11: .quad 0x0
+r12: .quad 0x0
+r13: .quad 0x0
+r14: .quad 0x0
+r15: .quad 0x0
+rip: .quad 0x0
+ .size entry64_regs, . - entry64_regs
+
+ /* GDT */
+ .section ".rodata"
+ .balign 16
+gdt:
+ /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+stack: .quad 0, 0
+stack_init:
diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c
new file mode 100644
index 000000000000..25e068ba3382
--- /dev/null
+++ b/arch/x86/purgatory/purgatory.c
@@ -0,0 +1,72 @@
+/*
+ * purgatory: Runs between two kernels
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "sha256.h"
+#include "../boot/string.h"
+
+struct sha_region {
+ unsigned long start;
+ unsigned long len;
+};
+
+unsigned long backup_dest = 0;
+unsigned long backup_src = 0;
+unsigned long backup_sz = 0;
+
+u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 };
+
+struct sha_region sha_regions[16] = {};
+
+/*
+ * On x86, second kernel requires first 640K of memory to boot. Copy
+ * first 640K to a backup region in reserved memory range so that second
+ * kernel can use first 640K.
+ */
+static int copy_backup_region(void)
+{
+ if (backup_dest)
+ memcpy((void *)backup_dest, (void *)backup_src, backup_sz);
+
+ return 0;
+}
+
+int verify_sha256_digest(void)
+{
+ struct sha_region *ptr, *end;
+ u8 digest[SHA256_DIGEST_SIZE];
+ struct sha256_state sctx;
+
+ sha256_init(&sctx);
+ end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])];
+ for (ptr = sha_regions; ptr < end; ptr++)
+ sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len);
+
+ sha256_final(&sctx, digest);
+
+ if (memcmp(digest, sha256_digest, sizeof(digest)))
+ return 1;
+
+ return 0;
+}
+
+void purgatory(void)
+{
+ int ret;
+
+ ret = verify_sha256_digest();
+ if (ret) {
+ /* loop forever */
+ for (;;)
+ ;
+ }
+ copy_backup_region();
+}
diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S
new file mode 100644
index 000000000000..fe3c91ba1bd0
--- /dev/null
+++ b/arch/x86/purgatory/setup-x86_64.S
@@ -0,0 +1,58 @@
+/*
+ * purgatory: setup code
+ *
+ * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com)
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This code has been taken from kexec-tools.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ .text
+ .globl purgatory_start
+ .balign 16
+purgatory_start:
+ .code64
+
+ /* Load a gdt so I know what the segment registers are */
+ lgdt gdt(%rip)
+
+ /* load the data segments */
+ movl $0x18, %eax /* data segment */
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+ movl %eax, %fs
+ movl %eax, %gs
+
+ /* Setup a stack */
+ leaq lstack_end(%rip), %rsp
+
+ /* Call the C code */
+ call purgatory
+ jmp entry64
+
+ .section ".rodata"
+ .balign 16
+gdt: /* 0x00 unusable segment
+ * 0x08 unused
+ * so use them as the gdt ptr
+ */
+ .word gdt_end - gdt - 1
+ .quad gdt
+ .word 0, 0, 0
+
+ /* 0x10 4GB flat code segment */
+ .word 0xFFFF, 0x0000, 0x9A00, 0x00AF
+
+ /* 0x18 4GB flat data segment */
+ .word 0xFFFF, 0x0000, 0x9200, 0x00CF
+gdt_end:
+
+ .bss
+ .balign 4096
+lstack:
+ .skip 4096
+lstack_end:
diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c
new file mode 100644
index 000000000000..548ca675a14a
--- /dev/null
+++ b/arch/x86/purgatory/sha256.c
@@ -0,0 +1,283 @@
+/*
+ * SHA-256, as specified in
+ * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf
+ *
+ * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>.
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (c) 2014 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+#include "sha256.h"
+#include "../boot/string.h"
+
+static inline u32 Ch(u32 x, u32 y, u32 z) /* SHA-256 "choose": per bit, x picks y (x=1) or z (x=0) */
+{
+ return z ^ (x & (y ^ z)); /* same value as (x & y) | (~x & z) */
+}
+
+static inline u32 Maj(u32 x, u32 y, u32 z) /* SHA-256 "majority": per bit, 1 when at least two of x, y, z are 1 */
+{
+ return (x & y) | (z & (x | y));
+}
+
+#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) /* round function Sigma0; used in t2 of every round */
+#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) /* round function Sigma1; used in t1 of every round */
+#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) /* message-schedule sigma0; used by BLEND_OP */
+#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) /* message-schedule sigma1; x is expanded three times, unparenthesised */
+
+static inline void LOAD_OP(int I, u32 *W, const u8 *input) /* W[I] = I-th 32-bit word of input, read as big-endian */
+{
+ W[I] = __be32_to_cpu(((__be32 *)(input))[I]); /* NOTE(review): reads input through a __be32 pointer; alignment is not checked here */
+}
+
+static inline void BLEND_OP(int I, u32 *W) /* derive schedule word W[I] from four earlier words; sha256_transform calls it with I in 16..63 */
+{
+ W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; /* u32 addition, so the sum wraps modulo 2^32 */
+}
+
+static void sha256_transform(u32 *state, const u8 *input) /* fold one 64-byte block at input into the eight-word state */
+{
+ u32 a, b, c, d, e, f, g, h, t1, t2;
+ u32 W[64];
+ int i;
+
+ /* load the input: 16 big-endian words taken from the block */
+ for (i = 0; i < 16; i++)
+ LOAD_OP(i, W, input);
+
+ /* now blend: fill W[16..63] from the words already present */
+ for (i = 16; i < 64; i++)
+ BLEND_OP(i, W);
+
+ /* load the state into our registers */
+ a = state[0]; b = state[1]; c = state[2]; d = state[3];
+ e = state[4]; f = state[5]; g = state[6]; h = state[7];
+
+ /* now iterate: 64 unrolled rounds; the variables swap roles each round instead of being shifted */
+ t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56];
+ t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2;
+ t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57];
+ t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2;
+ t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58];
+ t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2;
+ t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59];
+ t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2;
+ t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60];
+ t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2;
+ t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61];
+ t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2;
+ t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62];
+ t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2;
+ t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63];
+ t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2;
+
+ state[0] += a; state[1] += b; state[2] += c; state[3] += d; /* add this block's result onto the incoming state */
+ state[4] += e; state[5] += f; state[6] += g; state[7] += h;
+
+ /* clear any sensitive info... NOTE(review): these are stores to dead locals; a compiler may elide them */
+ a = b = c = d = e = f = g = h = t1 = t2 = 0;
+ memset(W, 0, 64 * sizeof(u32));
+}
+
+int sha256_init(struct sha256_state *sctx) /* reset sctx for a new hash; always returns 0 */
+{
+ sctx->state[0] = SHA256_H0; /* SHA256_H0..H7 are not defined in this file; presumably the standard initial hash values from <crypto/sha.h> */
+ sctx->state[1] = SHA256_H1;
+ sctx->state[2] = SHA256_H2;
+ sctx->state[3] = SHA256_H3;
+ sctx->state[4] = SHA256_H4;
+ sctx->state[5] = SHA256_H5;
+ sctx->state[6] = SHA256_H6;
+ sctx->state[7] = SHA256_H7;
+ sctx->count = 0; /* running byte count; sha256_update() adds len to it */
+
+ return 0;
+}
+
+int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) /* feed len bytes at data into the running hash; always returns 0 */
+{
+ unsigned int partial, done;
+ const u8 *src;
+
+ partial = sctx->count & 0x3f; /* bytes already waiting in sctx->buf (count mod 64) */
+ sctx->count += len;
+ done = 0;
+ src = data;
+
+ if ((partial + len) > 63) { /* enough bytes for at least one full 64-byte block */
+ if (partial) {
+ done = -partial; /* deliberate unsigned wrap: done + 64 == 64 - partial */
+ memcpy(sctx->buf + partial, data, done + 64); /* top up the buffered block from the new data */
+ src = sctx->buf;
+ }
+
+ do {
+ sha256_transform(sctx->state, src);
+ done += 64;
+ src = data + done; /* later blocks are hashed straight from the caller's data */
+ } while (done + 63 < len);
+
+ partial = 0;
+ }
+ memcpy(sctx->buf + partial, src, len - done); /* keep the tail (at most 63 bytes) for the next call */
+
+ return 0; /* NOTE(review): len is unsigned int; confirm no caller passes a longer region */
+}
+
+int sha256_final(struct sha256_state *sctx, u8 *out) /* pad and finish the hash, write eight big-endian words to out, wipe sctx; always returns 0 */
+{
+ __be32 *dst = (__be32 *)out;
+ __be64 bits;
+ unsigned int index, pad_len;
+ int i;
+ static const u8 padding[64] = { 0x80, }; /* one 0x80 byte followed by zeros */
+
+ /* Save number of bits (count is in bytes, hence << 3) before padding changes it */
+ bits = cpu_to_be64(sctx->count << 3);
+
+ /* Pad out to 56 mod 64: pad_len is 1..64, leaving 8 bytes for the length */
+ index = sctx->count & 0x3f;
+ pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
+ sha256_update(sctx, padding, pad_len);
+
+ /* Append length (before padding); this completes the last 64-byte block */
+ sha256_update(sctx, (const u8 *)&bits, sizeof(bits));
+
+ /* Store state in digest */
+ for (i = 0; i < 8; i++)
+ dst[i] = cpu_to_be32(sctx->state[i]);
+
+ /* Zeroize sensitive information; sctx needs sha256_init() before it is used again */
+ memset(sctx, 0, sizeof(*sctx));
+
+ return 0;
+}
diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h
new file mode 100644
index 000000000000..bd15a4127735
--- /dev/null
+++ b/arch/x86/purgatory/sha256.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author: Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#ifndef SHA256_H
+#define SHA256_H
+
+
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+extern int sha256_init(struct sha256_state *sctx); /* start a new hash in sctx; returns 0 */
+extern int sha256_update(struct sha256_state *sctx, const u8 *input, /* add length bytes at input; returns 0 */
+ unsigned int length);
+extern int sha256_final(struct sha256_state *sctx, u8 *hash); /* write the digest to hash and clear sctx; returns 0 */
+
+#endif /* SHA256_H */
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
new file mode 100644
index 000000000000..3cefba1fefc8
--- /dev/null
+++ b/arch/x86/purgatory/stack.S
@@ -0,0 +1,19 @@
+/*
+ * purgatory: stack
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+ /* A stack for the loaded kernel.
+ * Separate and in the data section so it can be prepopulated.
+ */
+ .data
+ .balign 4096
+ .globl stack, stack_end
+
+stack:
+ .skip 4096 /* reserves 4096 bytes between stack and stack_end */
+stack_end:
diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c
new file mode 100644
index 000000000000..d886b1fa36f0
--- /dev/null
+++ b/arch/x86/purgatory/string.c
@@ -0,0 +1,13 @@
+/*
+ * Simple string functions.
+ *
+ * Copyright (C) 2014 Red Hat Inc.
+ *
+ * Author:
+ * Vivek Goyal <vgoyal@redhat.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include "../boot/string.c"
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d1b4a119d4a5..9fe1b5d002f0 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -362,3 +362,5 @@
353 i386 renameat2 sys_renameat2
354 i386 seccomp sys_seccomp
355 i386 getrandom sys_getrandom
+356 i386 memfd_create sys_memfd_create
+357 i386 bpf sys_bpf
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 252c804bb1aa..281150b539a2 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -325,6 +325,9 @@
316 common renameat2 sys_renameat2
317 common seccomp sys_seccomp
318 common getrandom sys_getrandom
+319 common memfd_create sys_memfd_create
+320 common kexec_file_load sys_kexec_file_load
+321 common bpf sys_bpf
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl
new file mode 100644
index 000000000000..0b0b124d3ece
--- /dev/null
+++ b/arch/x86/tools/calc_run_size.pl
@@ -0,0 +1,30 @@
+#!/usr/bin/perl
+#
+# Calculate the amount of space needed to run the kernel, including room for
+# the .bss and .brk sections.
+#
+# Usage:
+# objdump -h a.out | perl calc_run_size.pl
+use strict;
+
+my $mem_size = 0; # sum of the .bss and .brk section sizes
+my $file_offset = 0; # file offset shared by both sections; 0 means "not seen yet"
+
+my $sections=' *[0-9]+ \.(?:bss|brk) +'; # single quotes keep "\." so the dot is matched literally
+while (<>) {
+ if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) { # $1 = size, two hex columns skipped (VMA, LMA), $2 = file offset
+ my $size = hex($1);
+ my $offset = hex($2);
+ $mem_size += $size;
+ if ($file_offset == 0) {
+ $file_offset = $offset;
+ } elsif ($file_offset != $offset) {
+ die ".bss and .brk lack common file offset\n";
+ }
+ }
+}
+
+if ($file_offset == 0) {
+ die "Never found .bss or .brk file offset\n";
+}
+printf("%d\n", $mem_size + $file_offset); # decimal total on stdout
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index bbb1d2259ecf..a5efb21d5228 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -695,7 +695,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel,
*
*/
static int per_cpu_shndx = -1;
-Elf_Addr per_cpu_load_addr;
+static Elf_Addr per_cpu_load_addr;
static void percpu_init(void)
{
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 0feee2fd5077..25a1022dd793 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -216,6 +216,5 @@ extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
#define SET_PERSONALITY(ex) do ; while(0)
-#define __HAVE_ARCH_GATE_AREA 1
#endif
diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h
index 54f8102ccde5..e59eef20647b 100644
--- a/arch/x86/um/asm/ptrace.h
+++ b/arch/x86/um/asm/ptrace.h
@@ -47,8 +47,6 @@ struct user_desc;
#ifdef CONFIG_X86_32
-#define HOST_AUDIT_ARCH AUDIT_ARCH_I386
-
extern int ptrace_get_thread_area(struct task_struct *child, int idx,
struct user_desc __user *user_desc);
@@ -57,8 +55,6 @@ extern int ptrace_set_thread_area(struct task_struct *child, int idx,
#else
-#define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64
-
#define PT_REGS_R8(r) UPT_R8(&(r)->regs)
#define PT_REGS_R9(r) UPT_R9(&(r)->regs)
#define PT_REGS_R10(r) UPT_R10(&(r)->regs)
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
new file mode 100644
index 000000000000..9fe77b7b5a0e
--- /dev/null
+++ b/arch/x86/um/asm/syscall.h
@@ -0,0 +1,15 @@
+#ifndef __UM_ASM_SYSCALL_H
+#define __UM_ASM_SYSCALL_H
+
+#include <uapi/linux/audit.h>
+
+static inline int syscall_get_arch(void) /* AUDIT_ARCH_* constant for this build, chosen at compile time */
+{
+#ifdef CONFIG_X86_32
+ return AUDIT_ARCH_I386;
+#else /* CONFIG_X86_32 not defined */
+ return AUDIT_ARCH_X86_64;
+#endif
+}
+
+#endif /* __UM_ASM_SYSCALL_H */
diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S
index 8d0c420465cc..fa4b8b9841ff 100644
--- a/arch/x86/um/checksum_32.S
+++ b/arch/x86/um/checksum_32.S
@@ -214,242 +214,3 @@ csum_partial:
ret
#endif
-
-/*
-unsigned int csum_partial_copy_generic (const char *src, char *dst,
- int len, int sum, int *src_err_ptr, int *dst_err_ptr)
- */
-
-/*
- * Copy from ds while checksumming, otherwise like csum_partial
- *
- * The macros SRC and DST specify the type of access for the instruction.
- * thus we can call a custom exception handler for all access types.
- *
- * FIXME: could someone double-check whether I haven't mixed up some SRC and
- * DST definitions? It's damn hard to trigger all cases. I hope I got
- * them all but there's no guarantee.
- */
-
-#define SRC(y...) \
- 9999: y; \
- _ASM_EXTABLE(9999b, 6001f)
-
-#define DST(y...) \
- 9999: y; \
- _ASM_EXTABLE(9999b, 6002f)
-
-.align 4
-
-#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
-
-#define ARGBASE 16
-#define FP 12
-
-csum_partial_copy_generic_i386:
- subl $4,%esp
- pushl %edi
- pushl %esi
- pushl %ebx
- movl ARGBASE+16(%esp),%eax # sum
- movl ARGBASE+12(%esp),%ecx # len
- movl ARGBASE+4(%esp),%esi # src
- movl ARGBASE+8(%esp),%edi # dst
-
- testl $2, %edi # Check alignment.
- jz 2f # Jump if alignment is ok.
- subl $2, %ecx # Alignment uses up two bytes.
- jae 1f # Jump if we had at least two bytes.
- addl $2, %ecx # ecx was < 2. Deal with it.
- jmp 4f
-SRC(1: movw (%esi), %bx )
- addl $2, %esi
-DST( movw %bx, (%edi) )
- addl $2, %edi
- addw %bx, %ax
- adcl $0, %eax
-2:
- movl %ecx, FP(%esp)
- shrl $5, %ecx
- jz 2f
- testl %esi, %esi
-SRC(1: movl (%esi), %ebx )
-SRC( movl 4(%esi), %edx )
- adcl %ebx, %eax
-DST( movl %ebx, (%edi) )
- adcl %edx, %eax
-DST( movl %edx, 4(%edi) )
-
-SRC( movl 8(%esi), %ebx )
-SRC( movl 12(%esi), %edx )
- adcl %ebx, %eax
-DST( movl %ebx, 8(%edi) )
- adcl %edx, %eax
-DST( movl %edx, 12(%edi) )
-
-SRC( movl 16(%esi), %ebx )
-SRC( movl 20(%esi), %edx )
- adcl %ebx, %eax
-DST( movl %ebx, 16(%edi) )
- adcl %edx, %eax
-DST( movl %edx, 20(%edi) )
-
-SRC( movl 24(%esi), %ebx )
-SRC( movl 28(%esi), %edx )
- adcl %ebx, %eax
-DST( movl %ebx, 24(%edi) )
- adcl %edx, %eax
-DST( movl %edx, 28(%edi) )
-
- lea 32(%esi), %esi
- lea 32(%edi), %edi
- dec %ecx
- jne 1b
- adcl $0, %eax
-2: movl FP(%esp), %edx
- movl %edx, %ecx
- andl $0x1c, %edx
- je 4f
- shrl $2, %edx # This clears CF
-SRC(3: movl (%esi), %ebx )
- adcl %ebx, %eax
-DST( movl %ebx, (%edi) )
- lea 4(%esi), %esi
- lea 4(%edi), %edi
- dec %edx
- jne 3b
- adcl $0, %eax
-4: andl $3, %ecx
- jz 7f
- cmpl $2, %ecx
- jb 5f
-SRC( movw (%esi), %cx )
- leal 2(%esi), %esi
-DST( movw %cx, (%edi) )
- leal 2(%edi), %edi
- je 6f
- shll $16,%ecx
-SRC(5: movb (%esi), %cl )
-DST( movb %cl, (%edi) )
-6: addl %ecx, %eax
- adcl $0, %eax
-7:
-5000:
-
-# Exception handler:
-.section .fixup, "ax"
-
-6001:
- movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
-
- # zero the complete destination - computing the rest
- # is too much work
- movl ARGBASE+8(%esp), %edi # dst
- movl ARGBASE+12(%esp), %ecx # len
- xorl %eax,%eax
- rep ; stosb
-
- jmp 5000b
-
-6002:
- movl ARGBASE+24(%esp), %ebx # dst_err_ptr
- movl $-EFAULT,(%ebx)
- jmp 5000b
-
-.previous
-
- popl %ebx
- popl %esi
- popl %edi
- popl %ecx # equivalent to addl $4,%esp
- ret
-
-#else
-
-/* Version for PentiumII/PPro */
-
-#define ROUND1(x) \
- SRC(movl x(%esi), %ebx ) ; \
- addl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
-
-#define ROUND(x) \
- SRC(movl x(%esi), %ebx ) ; \
- adcl %ebx, %eax ; \
- DST(movl %ebx, x(%edi) ) ;
-
-#define ARGBASE 12
-
-csum_partial_copy_generic_i386:
- pushl %ebx
- pushl %edi
- pushl %esi
- movl ARGBASE+4(%esp),%esi #src
- movl ARGBASE+8(%esp),%edi #dst
- movl ARGBASE+12(%esp),%ecx #len
- movl ARGBASE+16(%esp),%eax #sum
-# movl %ecx, %edx
- movl %ecx, %ebx
- movl %esi, %edx
- shrl $6, %ecx
- andl $0x3c, %ebx
- negl %ebx
- subl %ebx, %esi
- subl %ebx, %edi
- lea -1(%esi),%edx
- andl $-32,%edx
- lea 3f(%ebx,%ebx), %ebx
- testl %esi, %esi
- jmp *%ebx
-1: addl $64,%esi
- addl $64,%edi
- SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
- ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
- ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
- ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
- ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
-3: adcl $0,%eax
- addl $64, %edx
- dec %ecx
- jge 1b
-4: movl ARGBASE+12(%esp),%edx #len
- andl $3, %edx
- jz 7f
- cmpl $2, %edx
- jb 5f
-SRC( movw (%esi), %dx )
- leal 2(%esi), %esi
-DST( movw %dx, (%edi) )
- leal 2(%edi), %edi
- je 6f
- shll $16,%edx
-5:
-SRC( movb (%esi), %dl )
-DST( movb %dl, (%edi) )
-6: addl %edx, %eax
- adcl $0, %eax
-7:
-.section .fixup, "ax"
-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
- movl $-EFAULT, (%ebx)
- # zero the complete destination (computing the rest is too much work)
- movl ARGBASE+8(%esp),%edi # dst
- movl ARGBASE+12(%esp),%ecx # len
- xorl %eax,%eax
- rep; stosb
- jmp 7b
-6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
- movl $-EFAULT, (%ebx)
- jmp 7b
-.previous
-
- popl %esi
- popl %edi
- popl %ebx
- ret
-
-#undef ROUND
-#undef ROUND1
-
-#endif
diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c
index c6492e75797b..f8fecaddcc0d 100644
--- a/arch/x86/um/mem_64.c
+++ b/arch/x86/um/mem_64.c
@@ -9,18 +9,3 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return NULL;
}
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 5e04a1c899fa..79d824551c1a 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -370,13 +370,12 @@ struct rt_sigframe
char retcode[8];
};
-int setup_signal_stack_sc(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- sigset_t *mask)
+int setup_signal_stack_sc(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
/* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */
stack_top = ((stack_top + 4) & -16UL) - 4;
@@ -385,8 +384,8 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
@@ -410,20 +409,19 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) 0;
PT_REGS_CX(regs) = (unsigned long) 0;
return 0;
}
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs *regs,
- siginfo_t *info, sigset_t *mask)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *mask)
{
struct rt_sigframe __user *frame;
void __user *restorer;
- int err = 0;
+ int err = 0, sig = ksig->sig;
stack_top &= -8UL;
frame = (struct rt_sigframe __user *) stack_top - 1;
@@ -431,14 +429,14 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return 1;
restorer = frame->retcode;
- if (ka->sa.sa_flags & SA_RESTORER)
- restorer = ka->sa.sa_restorer;
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ restorer = ksig->ka.sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
err |= __put_user(sig, &frame->sig);
err |= __put_user(&frame->info, &frame->pinfo);
err |= __put_user(&frame->uc, &frame->puc);
- err |= copy_siginfo_to_user(&frame->info, info);
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask,
PT_REGS_SP(regs));
@@ -457,7 +455,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
return err;
PT_REGS_SP(regs) = (unsigned long) frame;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
PT_REGS_AX(regs) = (unsigned long) sig;
PT_REGS_DX(regs) = (unsigned long) &frame->info;
PT_REGS_CX(regs) = (unsigned long) &frame->uc;
@@ -502,12 +500,11 @@ struct rt_sigframe
struct _fpstate fpstate;
};
-int setup_signal_stack_si(unsigned long stack_top, int sig,
- struct k_sigaction *ka, struct pt_regs * regs,
- siginfo_t *info, sigset_t *set)
+int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
+ struct pt_regs *regs, sigset_t *set)
{
struct rt_sigframe __user *frame;
- int err = 0;
+ int err = 0, sig = ksig->sig;
frame = (struct rt_sigframe __user *)
round_down(stack_top - sizeof(struct rt_sigframe), 16);
@@ -517,8 +514,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
goto out;
- if (ka->sa.sa_flags & SA_SIGINFO) {
- err |= copy_siginfo_to_user(&frame->info, info);
+ if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
+ err |= copy_siginfo_to_user(&frame->info, &ksig->info);
if (err)
goto out;
}
@@ -543,8 +540,8 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
* already in userspace.
*/
/* x86-64 should always use SA_RESTORER. */
- if (ka->sa.sa_flags & SA_RESTORER)
- err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
+ if (ksig->ka.sa.sa_flags & SA_RESTORER)
+ err |= __put_user(ksig->ka.sa.sa_restorer, &frame->pretcode);
else
/* could use a vstub here */
return err;
@@ -570,7 +567,7 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,
*/
PT_REGS_SI(regs) = (unsigned long) &frame->info;
PT_REGS_DX(regs) = (unsigned long) &frame->uc;
- PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler;
+ PT_REGS_IP(regs) = (unsigned long) ksig->ka.sa.sa_handler;
out:
return err;
}
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h
index fd57829b30d8..0224987556ce 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/vdso/vdso2c.h
@@ -109,16 +109,18 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len,
/* Validate mapping addresses. */
for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) {
- if (!syms[i])
+ INT_BITS symval = syms[special_pages[i]];
+
+ if (!symval)
continue; /* The mapping isn't used; ignore it. */
- if (syms[i] % 4096)
+ if (symval % 4096)
fail("%s must be a multiple of 4096\n",
required_syms[i].name);
- if (syms[sym_vvar_start] > syms[i] + 4096)
- fail("%s underruns begin_vvar\n",
+ if (symval + 4096 < syms[sym_vvar_start])
+ fail("%s underruns vvar_start\n",
required_syms[i].name);
- if (syms[i] + 4096 > 0)
+ if (symval + 4096 > 0)
fail("%s is on the wrong side of the vdso text\n",
required_syms[i].name);
}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index e4f7781ee162..e904c270573b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -115,23 +115,6 @@ static __init int ia32_binfmt_init(void)
return 0;
}
__initcall(ia32_binfmt_init);
-#endif
-
-#else /* CONFIG_X86_32 */
-
-struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
-{
- return NULL;
-}
-
-int in_gate_area(struct mm_struct *mm, unsigned long addr)
-{
- return 0;
-}
-
-int in_gate_area_no_mm(unsigned long addr)
-{
- return 0;
-}
+#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a02e09e18f57..be14cc3e48d5 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -15,12 +15,14 @@
* with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+#include <linux/bitops.h>
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/string.h>
#include <xen/xen-ops.h>
+#include <asm/page.h>
#include <asm/setup.h>
void __init xen_efi_init(void)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 94813515fdd6..fac5e4f9607c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -821,7 +821,7 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
void xen_copy_trap_info(struct trap_info *traps)
{
- const struct desc_ptr *desc = &__get_cpu_var(idt_desc);
+ const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);
xen_convert_trap_info(desc, traps);
}
@@ -838,7 +838,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
spin_lock(&lock);
- __get_cpu_var(idt_desc) = *desc;
+ memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
xen_convert_trap_info(desc, traps);
@@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu)
pv_cpu_ops.load_gdt = xen_load_gdt;
}
+#ifdef CONFIG_XEN_PVH
/*
* A PV guest starts with default flags that are not set for PVH, set them
* here asap.
@@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void)
return;
xen_have_vector_callback = 1;
+
+ xen_pvh_early_cpu_init(0, false);
xen_pvh_set_cr_flags(0);
#ifdef CONFIG_X86_32
BUG(); /* PVH: Implement proper support. */
#endif
}
+#endif /* CONFIG_XEN_PVH */
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
struct physdev_set_iopl set_iopl;
+ unsigned long initrd_start = 0;
int rc;
if (!xen_start_info)
@@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
xen_setup_features();
+#ifdef CONFIG_XEN_PVH
xen_pvh_early_guest_init();
+#endif
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
@@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
- __supported_pte_mask |= _PAGE_IOMAP;
-
/*
* Prevent page tables from being allocated in highmem, even
* if CONFIG_HIGHPTE is enabled.
@@ -1631,9 +1636,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages);
- /* Allocate and initialize top and mid mfn levels for p2m structure */
- xen_build_mfn_list_list();
-
/* keep using Xen gdt for now; no urgent need to change it */
#ifdef CONFIG_X86_32
@@ -1667,10 +1669,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif
+ if (xen_start_info->mod_start) {
+ if (xen_start_info->flags & SIF_MOD_START_PFN)
+ initrd_start = PFN_PHYS(xen_start_info->mod_start);
+ else
+ initrd_start = __pa(xen_start_info->mod_start);
+ }
+
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
- boot_params.hdr.ramdisk_image = xen_start_info->mod_start
- ? __pa(xen_start_info->mod_start) : 0;
+ boot_params.hdr.ramdisk_image = initrd_start;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
@@ -1828,8 +1836,19 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_mmu_ops();
}
+static bool xen_nopv = false; /* true once "xen_nopv" was seen; read by xen_hvm_platform() and xen_hvm_need_lapic() */
+static __init int xen_parse_nopv(char *arg) /* early_param handler; arg is ignored, only the option's presence matters */
+{
+ xen_nopv = true;
+ return 0;
+}
+early_param("xen_nopv", xen_parse_nopv);
+
static uint32_t __init xen_hvm_platform(void)
{
+ if (xen_nopv)
+ return 0;
+
if (xen_pv_domain())
return 0;
@@ -1838,6 +1857,8 @@ static uint32_t __init xen_hvm_platform(void)
bool xen_hvm_need_lapic(void)
{
+ if (xen_nopv)
+ return false;
if (xen_pv_domain())
return false;
if (!xen_hvm_domain())
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index ebfa9b2c871d..1580e7a5a4cf 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -49,7 +49,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
-} gnttab_shared_vm_area, gnttab_status_vm_area;
+} gnttab_shared_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
@@ -73,43 +73,16 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
return 0;
}
-int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
- unsigned long max_nr_gframes,
- grant_status_t **__shared)
-{
- grant_status_t *shared = *__shared;
- unsigned long addr;
- unsigned long i;
-
- if (shared == NULL)
- *__shared = shared = gnttab_status_vm_area.area->addr;
-
- addr = (unsigned long)shared;
-
- for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
- mfn_pte(frames[i], PAGE_KERNEL));
- addr += PAGE_SIZE;
- }
-
- return 0;
-}
-
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
- pte_t **ptes;
unsigned long addr;
unsigned long i;
- if (shared == gnttab_status_vm_area.area->addr)
- ptes = gnttab_status_vm_area.ptes;
- else
- ptes = gnttab_shared_vm_area.ptes;
-
addr = (unsigned long)shared;
for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, ptes[i], __pte(0));
+ set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
+ __pte(0));
addr += PAGE_SIZE;
}
}
@@ -129,35 +102,12 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
return 0;
}
-static void arch_gnttab_vfree(struct gnttab_vm_area *area)
+int arch_gnttab_init(unsigned long nr_shared)
{
- free_vm_area(area->area);
- kfree(area->ptes);
-}
-
-int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
-{
- int ret;
-
if (!xen_pv_domain())
return 0;
- ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
- if (ret < 0)
- return ret;
-
- /*
- * Always allocate the space for the status frames in case
- * we're migrated to a host with V2 support.
- */
- ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
- if (ret < 0)
- goto err;
-
- return 0;
- err:
- arch_gnttab_vfree(&gnttab_shared_vm_area);
- return -ENOMEM;
+ return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
}
#ifdef CONFIG_XEN_PVH
@@ -168,6 +118,7 @@ static int __init xlated_setup_gnttab_pages(void)
{
struct page **pages;
xen_pfn_t *pfns;
+ void *vaddr;
int rc;
unsigned int i;
unsigned long nr_grant_frames = gnttab_max_grant_frames();
@@ -193,21 +144,20 @@ static int __init xlated_setup_gnttab_pages(void)
for (i = 0; i < nr_grant_frames; i++)
pfns[i] = page_to_pfn(pages[i]);
- rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames,
- &xen_auto_xlat_grant_frames.vaddr);
-
- if (rc) {
+ vaddr = vmap(pages, nr_grant_frames, 0, PAGE_KERNEL);
+ if (!vaddr) {
pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__,
nr_grant_frames, rc);
free_xenballooned_pages(nr_grant_frames, pages);
kfree(pages);
kfree(pfns);
- return rc;
+ return -ENOMEM;
}
kfree(pages);
xen_auto_xlat_grant_frames.pfn = pfns;
xen_auto_xlat_grant_frames.count = nr_grant_frames;
+ xen_auto_xlat_grant_frames.vaddr = vaddr;
return 0;
}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e8a1201c3293..a8a1a3d08d4d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (unlikely(mfn == INVALID_P2M_ENTRY)) {
mfn = 0;
flags = 0;
- } else {
- /*
- * Paramount to do this test _after_ the
- * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
- * IDENTITY_FRAME_BIT resolves to true.
- */
- mfn &= ~FOREIGN_FRAME_BIT;
- if (mfn & IDENTITY_FRAME_BIT) {
- mfn &= ~IDENTITY_FRAME_BIT;
- flags |= _PAGE_IOMAP;
- }
- }
+ } else
+ mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
return val;
}
-static pteval_t iomap_pte(pteval_t val)
-{
- if (val & _PAGE_PRESENT) {
- unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
- pteval_t flags = val & PTE_FLAGS_MASK;
-
- /* We assume the pte frame number is a MFN, so
- just use it as-is. */
- val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
- }
-
- return val;
-}
-
__visible pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
@@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte)
pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
}
#endif
- if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
- return pteval;
-
return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -481,7 +454,6 @@ void xen_set_pat(u64 pat)
__visible pte_t xen_make_pte(pteval_t pte)
{
- phys_addr_t addr = (pte & PTE_PFN_MASK);
#if 0
/* If Linux is trying to set a WC pte, then map to the Xen WC.
* If _PAGE_PAT is set, then it probably means it is really
@@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte)
pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
}
#endif
- /*
- * Unprivileged domains are allowed to do IOMAPpings for
- * PCI passthrough, but not map ISA space. The ISA
- * mappings are just dummy local mappings to keep other
- * parts of the kernel happy.
- */
- if (unlikely(pte & _PAGE_IOMAP) &&
- (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
- pte = iomap_pte(pte);
- } else {
- pte &= ~_PAGE_IOMAP;
- pte = pte_pfn_to_mfn(pte);
- }
+ pte = pte_pfn_to_mfn(pte);
return native_make_pte(pte);
}
@@ -1257,10 +1217,13 @@ static void __init xen_pagetable_p2m_copy(void)
static void __init xen_pagetable_init(void)
{
paging_init();
- xen_setup_shared_info();
#ifdef CONFIG_X86_64
xen_pagetable_p2m_copy();
#endif
+ /* Allocate and initialize top and mid mfn levels for p2m structure */
+ xen_build_mfn_list_list();
+
+ xen_setup_shared_info();
xen_post_allocator_init();
}
static void xen_write_cr2(unsigned long cr2)
@@ -1866,12 +1829,11 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
*
* We can construct this by grafting the Xen provided pagetable into
* head_64.S's preconstructed pagetables. We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working. We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- * NOTE: for PVH, the page tables are native.
+ * level2_ident_pgt, and level2_kernel_pgt. This means that only the
+ * kernel has a physical mapping to start with - but that's enough to
+ * get __va working. We need to fill in the rest of the physical
+ * mapping once some sort of allocator has been set up. NOTE: for
+ * PVH, the page tables are native.
*/
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
@@ -1902,8 +1864,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* L3_i[0] -> level2_ident_pgt */
convert_pfn_mfn(level3_ident_pgt);
/* L3_k[510] -> level2_kernel_pgt
- * L3_i[511] -> level2_fixmap_pgt */
+ * L3_k[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
+
+ /* L3_k[511][506] -> level1_fixmap_pgt */
+ convert_pfn_mfn(level2_fixmap_pgt);
}
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
@@ -1913,21 +1878,15 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
addr[1] = (unsigned long)l3;
addr[2] = (unsigned long)l2;
/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
- * Both L4[272][0] and L4[511][511] have entries that point to the same
+ * Both L4[272][0] and L4[511][510] have entries that point to the same
* L2 (PMD) tables. Meaning that if you modify it in __va space
* it will be also modified in the __ka space! (But if you just
* modify the PMD table to point to other PTE's or none, then you
* are OK - which is what cleanup_highmap does) */
copy_page(level2_ident_pgt, l2);
- /* Graft it onto L4[511][511] */
+ /* Graft it onto L4[511][510] */
copy_page(level2_kernel_pgt, l2);
- /* Get [511][510] and graft that in level2_fixmap_pgt */
- l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
- l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
- copy_page(level2_fixmap_pgt, l2);
- /* Note that we don't do anything with level1_fixmap_pgt which
- * we don't need. */
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
@@ -1937,6 +1896,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
@@ -2094,7 +2054,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
default:
/* By default, set_fixmap is used for hardware mappings */
- pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+ pte = mfn_pte(phys, prot);
break;
}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 0d82003e76ad..ea54a08d8301 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -54,7 +54,7 @@ DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
@@ -131,7 +131,7 @@ void xen_mc_flush(void)
struct multicall_space __xen_mc_entry(size_t args)
{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct multicall_space ret;
unsigned argidx = roundup(b->argidx, sizeof(u64));
@@ -162,7 +162,7 @@ struct multicall_space __xen_mc_entry(size_t args)
struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
@@ -192,7 +192,7 @@ out:
void xen_mc_callback(void (*fn)(void *), void *data)
{
- struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+ struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct callback *cb;
if (b->cbidx == MC_BATCH) {
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9bb3d82ffec8..b456b048eca9 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -163,6 +163,7 @@
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
+#include <linux/bootmem.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -173,6 +174,7 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
+#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
@@ -180,38 +182,27 @@ static void __init m2p_override_init(void);
unsigned long xen_max_p2m_pfn __read_mostly;
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+static unsigned long *p2m_mid_missing_mfn;
+static unsigned long *p2m_top_mfn;
+static unsigned long **p2m_top_mfn_p;
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-/* We might hit two boundary violations at the start and end, at max each
- * boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
-
-/* When we populate back during bootup, the amount of pages can vary. The
- * max we have is seen is 395979, but that does not mean it can't be more.
- * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
- * it can re-use Xen provided mfn_list array, so we only need to allocate at
- * most three P2M top nodes. */
-RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+/* For each I/O range remapped we may lose up to two leaf pages for the boundary
+ * violations and three mid pages to cover up to 3GB. With
+ * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
+ * remapped region.
+ */
+RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
static inline unsigned p2m_top_index(unsigned long pfn)
{
@@ -281,11 +272,11 @@ static void p2m_init(unsigned long *p2m)
* Build the parallel p2m_top_mfn and p2m_mid_mfn structures
*
* This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
+ * - At boot time we're called rather early, and must use alloc_bootmem*()
* to allocate memory.
*
* - After resume we're called from within stop_machine, but the mfn
- * tree should alreay be completely allocated.
+ * tree should already be completely allocated.
*/
void __ref xen_build_mfn_list_list(void)
{
@@ -296,20 +287,17 @@ void __ref xen_build_mfn_list_list(void)
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
- p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
- p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_p_init(p2m_top_mfn_p);
- p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
- p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity);
}
for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
@@ -337,10 +325,9 @@ void __ref xen_build_mfn_list_list(void)
/*
* XXX boot-time only! We should never find
* missing parts of the mfn tree after
- * runtime. extend_brk() will BUG if we call
- * it too late.
+ * runtime.
*/
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
@@ -424,7 +411,6 @@ void __init xen_build_dynamic_phys_to_machine(void)
m2p_override_init();
}
#ifdef CONFIG_X86_64
-#include <linux/bootmem.h>
unsigned long __init xen_revector_p2m_tree(void)
{
unsigned long va_start;
@@ -486,7 +472,6 @@ unsigned long __init xen_revector_p2m_tree(void)
copy_page(new, mid_p);
p2m_top[topidx][mididx] = &mfn_list[pfn_free];
- p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]);
pfn_free += P2M_PER_PAGE;
@@ -547,12 +532,13 @@ static bool alloc_p2m(unsigned long pfn)
unsigned topidx, mididx;
unsigned long ***top_p, **mid;
unsigned long *top_mfn_p, *mid_mfn;
+ unsigned long *p2m_orig;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
top_p = &p2m_top[topidx];
- mid = *top_p;
+ mid = ACCESS_ONCE(*top_p);
if (mid == p2m_mid_missing) {
/* Mid level is missing, allocate a new one */
@@ -567,7 +553,7 @@ static bool alloc_p2m(unsigned long pfn)
}
top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = p2m_top_mfn_p[topidx];
+ mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
@@ -575,6 +561,7 @@ static bool alloc_p2m(unsigned long pfn)
/* Separately check the mid mfn level */
unsigned long missing_mfn;
unsigned long mid_mfn_mfn;
+ unsigned long old_mfn;
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
@@ -584,17 +571,19 @@ static bool alloc_p2m(unsigned long pfn)
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
- if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
+ if (old_mfn != missing_mfn) {
free_p2m_page(mid_mfn);
- else
+ mid_mfn = mfn_to_virt(old_mfn);
+ } else {
p2m_top_mfn_p[topidx] = mid_mfn;
+ }
}
- if (p2m_top[topidx][mididx] == p2m_identity ||
- p2m_top[topidx][mididx] == p2m_missing) {
+ p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]);
+ if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) {
/* p2m leaf page is missing */
unsigned long *p2m;
- unsigned long *p2m_orig = p2m_top[topidx][mididx];
p2m = alloc_p2m_page();
if (!p2m)
@@ -615,7 +604,6 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
{
unsigned topidx, mididx, idx;
unsigned long *p2m;
- unsigned long *mid_mfn_p;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
@@ -642,43 +630,21 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
p2m_top[topidx][mididx] = p2m;
- /* For save/restore we need to MFN of the P2M saved */
-
- mid_mfn_p = p2m_top_mfn_p[topidx];
- WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
- "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
- topidx, mididx);
- mid_mfn_p[mididx] = virt_to_mfn(p2m);
-
return true;
}
static bool __init early_alloc_p2m_middle(unsigned long pfn)
{
unsigned topidx = p2m_top_index(pfn);
- unsigned long *mid_mfn_p;
unsigned long **mid;
mid = p2m_top[topidx];
- mid_mfn_p = p2m_top_mfn_p[topidx];
if (mid == p2m_mid_missing) {
mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
p2m_mid_init(mid, p2m_missing);
p2m_top[topidx] = mid;
-
- BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
- }
- /* And the save/restore P2M tables.. */
- if (mid_mfn_p == p2m_mid_missing_mfn) {
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
-
- p2m_top_mfn_p[topidx] = mid_mfn_p;
- p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- /* Note: we don't set mid_mfn_p[midix] here,
- * look in early_alloc_p2m() */
}
return true;
}
@@ -689,14 +655,13 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn)
* replace the P2M leaf with a p2m_missing or p2m_identity.
* Stick the old page in the new P2M tree location.
*/
-bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn)
+static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn)
{
unsigned topidx;
unsigned mididx;
unsigned ident_pfns;
unsigned inv_pfns;
unsigned long *p2m;
- unsigned long *mid_mfn_p;
unsigned idx;
unsigned long pfn;
@@ -742,11 +707,6 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_
found:
/* Found one, replace old with p2m_identity or p2m_missing */
p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
- /* And the other for save/restore.. */
- mid_mfn_p = p2m_top_mfn_p[topidx];
- /* NOTE: Even if it is a p2m_identity it should still be point to
- * a page filled with INVALID_P2M_ENTRY entries. */
- mid_mfn_p[mididx] = virt_to_mfn(p2m_missing);
/* Reset where we want to stick the old page in. */
topidx = p2m_top_index(set_pfn);
@@ -761,8 +721,6 @@ found:
p2m_init(p2m);
p2m_top[topidx][mididx] = p2m;
- mid_mfn_p = p2m_top_mfn_p[topidx];
- mid_mfn_p[mididx] = virt_to_mfn(p2m);
return true;
}
@@ -772,7 +730,7 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
if (!early_alloc_p2m_middle(pfn))
return false;
- if (early_can_reuse_p2m_middle(pfn, mfn))
+ if (early_can_reuse_p2m_middle(pfn))
return __set_phys_to_machine(pfn, mfn);
if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
@@ -841,10 +799,9 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
pfn = ALIGN(pfn, P2M_PER_PAGE);
}
- if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
+ WARN((pfn - pfn_s) != (pfn_e - pfn_s),
"Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s)))
- printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
+ (pfn_e - pfn_s) - (pfn - pfn_s));
return pfn - pfn_s;
}
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
new file mode 100644
index 000000000000..ad8aee24ab72
--- /dev/null
+++ b/arch/x86/xen/p2m.h
@@ -0,0 +1,15 @@
+#ifndef _XEN_P2M_H
+#define _XEN_P2M_H
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+#define MAX_REMAP_RANGES 10
+
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+ unsigned long pfn_e);
+
+#endif /* _XEN_P2M_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2e555163c2fe..29834b3fd87f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -29,6 +29,7 @@
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
+#include "p2m.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
+/* Buffer used to remap identity mapped pages */
+unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
@@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start,
return len;
}
-static unsigned long __init xen_release_chunk(unsigned long start,
- unsigned long end)
-{
- return xen_do_chunk(start, end, true);
-}
-
-static unsigned long __init xen_populate_chunk(
+/*
+ * Finds the next RAM pfn available in the E820 map after min_pfn.
+ * This function updates min_pfn with the pfn found and returns
+ * the size of that range or zero if not found.
+ */
+static unsigned long __init xen_find_pfn_range(
const struct e820entry *list, size_t map_size,
- unsigned long max_pfn, unsigned long *last_pfn,
- unsigned long credits_left)
+ unsigned long *min_pfn)
{
const struct e820entry *entry;
unsigned int i;
unsigned long done = 0;
- unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
- unsigned long pfns;
- long capacity;
-
- if (credits_left <= 0)
- break;
if (entry->type != E820_RAM)
continue;
e_pfn = PFN_DOWN(entry->addr + entry->size);
- /* We only care about E820 after the xen_start_info->nr_pages */
- if (e_pfn <= max_pfn)
+ /* We only care about E820 after this */
+ if (e_pfn < *min_pfn)
continue;
s_pfn = PFN_UP(entry->addr);
- /* If the E820 falls within the nr_pages, we want to start
- * at the nr_pages PFN.
- * If that would mean going past the E820 entry, skip it
+
+ /* If min_pfn falls within the E820 entry, we want to start
+ * at the min_pfn PFN.
*/
- if (s_pfn <= max_pfn) {
- capacity = e_pfn - max_pfn;
- dest_pfn = max_pfn;
+ if (s_pfn <= *min_pfn) {
+ done = e_pfn - *min_pfn;
} else {
- capacity = e_pfn - s_pfn;
- dest_pfn = s_pfn;
+ done = e_pfn - s_pfn;
+ *min_pfn = s_pfn;
}
+ break;
+ }
- if (credits_left < capacity)
- capacity = credits_left;
+ return done;
+}
- pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
- done += pfns;
- *last_pfn = (dest_pfn + pfns);
- if (pfns < capacity)
- break;
- credits_left -= pfns;
+/*
+ * This releases a chunk of memory and then does the identity map. It's used as
+ * a fallback if the remapping fails.
+ */
+static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
+ unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
+ unsigned long *released)
+{
+ WARN_ON(start_pfn > end_pfn);
+
+ /* Need to release pages first */
+ *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+ *identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+/*
+ * Helper function to update both the p2m and m2p tables.
+ */
+static unsigned long __init xen_update_mem_tables(unsigned long pfn,
+ unsigned long mfn)
+{
+ struct mmu_update update = {
+ .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+ .val = pfn
+ };
+
+ /* Update p2m */
+ if (!early_set_phys_to_machine(pfn, mfn)) {
+ WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+ pfn, mfn);
+ return false;
}
- return done;
+
+ /* Update m2p */
+ if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
+ WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
+ mfn, pfn);
+ return false;
+ }
+
+ return true;
}
-static void __init xen_set_identity_and_release_chunk(
- unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
- unsigned long *released, unsigned long *identity)
+/*
+ * This function updates the p2m and m2p tables with an identity map from
+ * start_pfn to start_pfn+size and remaps the underlying RAM of the original
+ * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
+ * to not exhaust the reserved brk space. Doing it in properly aligned blocks
+ * ensures we only allocate the minimum required leaf pages in the p2m table. It
+ * copies the existing mfns from the p2m table under the 1:1 map, overwrites
+ * them with the identity map and then updates the p2m and m2p tables with the
+ * remapped memory.
+ */
+static unsigned long __init xen_do_set_identity_and_remap_chunk(
+ unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
- unsigned long pfn;
+ unsigned long ident_pfn_iter, remap_pfn_iter;
+ unsigned long ident_start_pfn_align, remap_start_pfn_align;
+ unsigned long ident_end_pfn_align, remap_end_pfn_align;
+ unsigned long ident_boundary_pfn, remap_boundary_pfn;
+ unsigned long ident_cnt = 0;
+ unsigned long remap_cnt = 0;
+ unsigned long left = size;
+ unsigned long mod;
+ int i;
+
+ WARN_ON(size == 0);
+
+ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
/*
- * If the PFNs are currently mapped, clear the mappings
- * (except for the ISA region which must be 1:1 mapped) to
- * release the refcounts (in Xen) on the original frames.
+ * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
+ * blocks. We need to keep track of both the existing pfn mapping and
+ * the new pfn remapping.
*/
- for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
- pte_t pte = __pte_ma(0);
+ mod = start_pfn % P2M_PER_PAGE;
+ ident_start_pfn_align =
+ mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
+ mod = remap_pfn % P2M_PER_PAGE;
+ remap_start_pfn_align =
+ mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
+ mod = (start_pfn + size) % P2M_PER_PAGE;
+ ident_end_pfn_align = start_pfn + size - mod;
+ mod = (remap_pfn + size) % P2M_PER_PAGE;
+ remap_end_pfn_align = remap_pfn + size - mod;
+
+ /* Iterate over each p2m leaf node in each range */
+ for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
+ ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
+ ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
+ /* Check we aren't past the end */
+ BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
+ BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
+
+ /* Save p2m mappings */
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
+
+ /* Set identity map which will free a p2m leaf */
+ ident_cnt += set_phys_range_identity(ident_pfn_iter,
+ ident_pfn_iter + P2M_PER_PAGE);
+
+#ifdef DEBUG
+ /* Helps verify a p2m leaf has been freed */
+ for (i = 0; i < P2M_PER_PAGE; i++) {
+ unsigned int pfn = ident_pfn_iter + i;
+ BUG_ON(pfn_to_mfn(pfn) != pfn);
+ }
+#endif
+ /* Now remap memory */
+ for (i = 0; i < P2M_PER_PAGE; i++) {
+ unsigned long mfn = xen_remap_buf[i];
+
+ /* This will use the p2m leaf freed above */
+ if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
+ WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+ remap_pfn_iter + i, mfn);
+ return 0;
+ }
+
+ remap_cnt++;
+ }
- if (pfn < PFN_UP(ISA_END_ADDRESS))
- pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+ left -= P2M_PER_PAGE;
+ }
- (void)HYPERVISOR_update_va_mapping(
- (unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+ /* Max boundary space possible */
+ BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+
+ /* Now handle the boundary conditions */
+ ident_boundary_pfn = start_pfn;
+ remap_boundary_pfn = remap_pfn;
+ for (i = 0; i < left; i++) {
+ unsigned long mfn;
+
+ /* These two checks move from the start to end boundaries */
+ if (ident_boundary_pfn == ident_start_pfn_align)
+ ident_boundary_pfn = ident_pfn_iter;
+ if (remap_boundary_pfn == remap_start_pfn_align)
+ remap_boundary_pfn = remap_pfn_iter;
+
+ /* Check we aren't past the end */
+ BUG_ON(ident_boundary_pfn >= start_pfn + size);
+ BUG_ON(remap_boundary_pfn >= remap_pfn + size);
+
+ mfn = pfn_to_mfn(ident_boundary_pfn);
+
+ if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
+ WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+ remap_pfn_iter + i, mfn);
+ return 0;
+ }
+ remap_cnt++;
+
+ ident_boundary_pfn++;
+ remap_boundary_pfn++;
}
- if (start_pfn < nr_pages)
- *released += xen_release_chunk(
- start_pfn, min(end_pfn, nr_pages));
+ /* Finish up the identity map */
+ if (ident_start_pfn_align >= ident_end_pfn_align) {
+ /*
+ * In this case we have an identity range which does not span an
+ * aligned block so everything needs to be identity mapped here.
+ * If we didn't check this we might remap too many pages since
+ * the align boundaries are not meaningful in this case.
+ */
+ ident_cnt += set_phys_range_identity(start_pfn,
+ start_pfn + size);
+ } else {
+ /* Remapped above so check each end of the chunk */
+ if (start_pfn < ident_start_pfn_align)
+ ident_cnt += set_phys_range_identity(start_pfn,
+ ident_start_pfn_align);
+ if (start_pfn + size > ident_pfn_iter)
+ ident_cnt += set_phys_range_identity(ident_pfn_iter,
+ start_pfn + size);
+ }
- *identity += set_phys_range_identity(start_pfn, end_pfn);
+ BUG_ON(ident_cnt != size);
+ BUG_ON(remap_cnt != size);
+
+ return size;
}
-static unsigned long __init xen_set_identity_and_release(
- const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+/*
+ * This function takes a contiguous pfn range that needs to be identity mapped
+ * and:
+ *
+ * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
+ * 2) Calls the do_ function to actually do the mapping/remapping work.
+ *
+ * The goal is to not allocate additional memory but to remap the existing
+ * pages. In the case of an error the underlying memory is simply released back
+ * to Xen and not remapped.
+ */
+static unsigned long __init xen_set_identity_and_remap_chunk(
+ const struct e820entry *list, size_t map_size, unsigned long start_pfn,
+ unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
+ unsigned long *identity, unsigned long *remapped,
+ unsigned long *released)
+{
+ unsigned long pfn;
+ unsigned long i = 0;
+ unsigned long n = end_pfn - start_pfn;
+
+ while (i < n) {
+ unsigned long cur_pfn = start_pfn + i;
+ unsigned long left = n - i;
+ unsigned long size = left;
+ unsigned long remap_range_size;
+
+ /* Do not remap pages beyond the current allocation */
+ if (cur_pfn >= nr_pages) {
+ /* Identity map remaining pages */
+ *identity += set_phys_range_identity(cur_pfn,
+ cur_pfn + size);
+ break;
+ }
+ if (cur_pfn + size > nr_pages)
+ size = nr_pages - cur_pfn;
+
+ remap_range_size = xen_find_pfn_range(list, map_size,
+ &remap_pfn);
+ if (!remap_range_size) {
+ pr_warning("Unable to find available pfn range, not remapping identity pages\n");
+ xen_set_identity_and_release_chunk(cur_pfn,
+ cur_pfn + left, nr_pages, identity, released);
+ break;
+ }
+ /* Adjust size to fit in current e820 RAM region */
+ if (size > remap_range_size)
+ size = remap_range_size;
+
+ if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
+ WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
+ cur_pfn, size, remap_pfn);
+ xen_set_identity_and_release_chunk(cur_pfn,
+ cur_pfn + left, nr_pages, identity, released);
+ break;
+ }
+
+ /* Update variables to reflect new mappings. */
+ i += size;
+ remap_pfn += size;
+ *identity += size;
+ *remapped += size;
+ }
+
+ /*
+ * If the PFNs are currently mapped, the VA mapping also needs
+ * to be updated to be 1:1.
+ */
+ for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+ (void)HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+ return remap_pfn;
+}
+
+static unsigned long __init xen_set_identity_and_remap(
+ const struct e820entry *list, size_t map_size, unsigned long nr_pages,
+ unsigned long *released)
{
phys_addr_t start = 0;
- unsigned long released = 0;
unsigned long identity = 0;
+ unsigned long remapped = 0;
+ unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
+ unsigned long num_released = 0;
int i;
/*
* Combine non-RAM regions and gaps until a RAM region (or the
* end of the map) is reached, then set the 1:1 map and
- * release the pages (if available) in those non-RAM regions.
+ * remap the memory in those non-RAM regions.
*
* The combined non-RAM regions are rounded to a whole number
* of pages so any partial pages are accessible via the 1:1
@@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release(
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
- xen_set_identity_and_release_chunk(
- start_pfn, end_pfn, nr_pages,
- &released, &identity);
-
+ last_pfn = xen_set_identity_and_remap_chunk(
+ list, map_size, start_pfn,
+ end_pfn, nr_pages, last_pfn,
+ &identity, &remapped,
+ &num_released);
start = end;
}
}
- if (released)
- printk(KERN_INFO "Released %lu pages of unused memory\n", released);
- if (identity)
- printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+ *released = num_released;
- return released;
-}
+ pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
+ pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
+ last_pfn);
+ pr_info("Released %ld page(s)\n", num_released);
+ return last_pfn;
+}
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -347,7 +571,6 @@ char * __init xen_memory_setup(void)
unsigned long max_pages;
unsigned long last_pfn = 0;
unsigned long extra_pages = 0;
- unsigned long populated;
int i;
int op;
@@ -372,6 +595,7 @@ char * __init xen_memory_setup(void)
rc = 0;
}
BUG_ON(rc);
+ BUG_ON(memmap.nr_entries == 0);
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -392,20 +616,11 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
- * Set P2M for all non-RAM pages and E820 gaps to be identity
- * type PFNs. Any RAM pages that would be made inaccesible by
- * this are first released.
+ * Set identity map on non-RAM pages and remap the underlying RAM.
*/
- xen_released_pages = xen_set_identity_and_release(
- map, memmap.nr_entries, max_pfn);
-
- /*
- * Populate back the non-RAM pages and E820 gaps that had been
- * released. */
- populated = xen_populate_chunk(map, memmap.nr_entries,
- max_pfn, &last_pfn, xen_released_pages);
+ last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+ &xen_released_pages);
- xen_released_pages -= populated;
extra_pages += xen_released_pages;
if (last_pfn > max_pfn) {
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7005974c3ff3..4c071aeb8417 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -37,6 +37,7 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
+#include "smp.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -99,10 +100,14 @@ static void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
-/* Note: cpu parameter is only relevant for PVH */
-static void cpu_bringup_and_idle(int cpu)
+/*
+ * Note: cpu parameter is only relevant for PVH. The reason for passing it
+ * is we can't do smp_processor_id until the percpu segments are loaded, for
+ * which we need the cpu number! So we pass it in rdi as first parameter.
+ */
+asmlinkage __visible void cpu_bringup_and_idle(int cpu)
{
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PVH
if (xen_feature(XENFEAT_auto_translated_physmap) &&
xen_feature(XENFEAT_supervisor_mode_kernel))
xen_pvh_secondary_vcpu_init(cpu);
@@ -360,6 +365,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
struct desc_struct *gdt;
unsigned long gdt_mfn;
+ /* used to tell cpu_init() that it can proceed with initialization */
+ cpumask_set_cpu(cpu, cpu_callout_mask);
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
@@ -374,11 +381,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
- ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
@@ -413,15 +419,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
(unsigned long)xen_failsafe_callback;
ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-#ifdef CONFIG_X86_32
}
-#else
- } else
- /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
- * %rdi having the cpu number - which means are passing in
- * as the first parameter the cpu. Subtle!
+#ifdef CONFIG_XEN_PVH
+ else {
+ /*
+ * The vcpu comes on kernel page tables which have the NX pte
+ * bit set. This means before DS/SS is touched, NX in
+ * EFER must be set. Hence the following assembly glue code.
*/
+ ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
ctxt->user_regs.rdi = cpu;
+ ctxt->user_regs.rsi = true; /* entry == true */
+ }
#endif
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
@@ -501,6 +510,9 @@ static void xen_cpu_die(unsigned int cpu)
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
+
+ cpu_die_common(cpu);
+
xen_smp_intr_free(cpu);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index c7c2d89efd76..963d62a35c82 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector);
extern void xen_send_IPI_all(int vector);
extern void xen_send_IPI_self(int vector);
+#ifdef CONFIG_XEN_PVH
+extern void xen_pvh_early_cpu_init(int cpu, bool entry);
+#else
+static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
+{
+}
+#endif
+
#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 0ba5f3b967f0..23b45eb9a89c 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -109,7 +109,7 @@ static bool xen_pvspin = true;
__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
int irq = __this_cpu_read(lock_kicker_irq);
- struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
+ struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
int cpu = smp_processor_id();
u64 start;
unsigned long flags;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 7b78f88c1707..f473d268d387 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -80,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
BUG_ON(preemptible());
- state = &__get_cpu_var(xen_runstate);
+ state = this_cpu_ptr(&xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
@@ -123,7 +123,7 @@ static void do_stolen_accounting(void)
WARN_ON(state.state != RUNSTATE_running);
- snap = &__get_cpu_var(xen_runstate_snapshot);
+ snap = this_cpu_ptr(&xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
@@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void)
cycle_t ret;
preempt_disable_notrace();
- src = &__get_cpu_var(xen_vcpu)->time;
+ src = &__this_cpu_read(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
preempt_enable_notrace();
return ret;
@@ -397,7 +397,7 @@ static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
- struct clock_event_device *evt = &__get_cpu_var(xen_clock_events).evt;
+ struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
irqreturn_t ret;
ret = IRQ_NONE;
@@ -444,7 +444,7 @@ void xen_setup_timer(int cpu)
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
- IRQF_FORCE_RESUME,
+ IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
name, NULL);
(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
@@ -460,7 +460,7 @@ void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
- clockevents_register_device(&__get_cpu_var(xen_clock_events).evt);
+ clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}
void xen_timer_resume(void)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 485b69585540..674b222544b7 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -47,6 +47,41 @@ ENTRY(startup_xen)
__FINIT
+#ifdef CONFIG_XEN_PVH
+/*
+ * xen_pvh_early_cpu_init() - early PVH VCPU initialization
+ * @cpu: this cpu number (%rdi)
+ * @entry: true if this is a secondary vcpu coming up on this entry
+ * point, false if this is the boot CPU being initialized for
+ * the first time (%rsi)
+ *
+ * Note: This is called as a function on the boot CPU, and is the entry point
+ * on the secondary CPU.
+ */
+ENTRY(xen_pvh_early_cpu_init)
+ mov %rsi, %r11
+
+ /* Gather features to see if NX implemented. */
+ mov $0x80000001, %eax
+ cpuid
+ mov %edx, %esi
+
+ mov $MSR_EFER, %ecx
+ rdmsr
+ bts $_EFER_SCE, %eax
+
+ bt $20, %esi
+ jnc 1f /* No NX, skip setting it */
+ bts $_EFER_NX, %eax
+1: wrmsr
+#ifdef CONFIG_SMP
+ cmp $0, %r11b
+ jne cpu_bringup_and_idle
+#endif
+ ret
+
+#endif /* CONFIG_XEN_PVH */
+
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
@@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6)
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+ ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)