aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig72
-rw-r--r--arch/x86/Kconfig.cpu6
-rw-r--r--arch/x86/Kconfig.debug25
-rw-r--r--arch/x86/boot/compressed/Makefile10
-rw-r--r--arch/x86/boot/compressed/eboot.c1022
-rw-r--r--arch/x86/boot/compressed/eboot.h61
-rw-r--r--arch/x86/boot/compressed/efi_stub_32.S86
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S1
-rw-r--r--arch/x86/boot/compressed/head_32.S22
-rw-r--r--arch/x86/boot/compressed/head_64.S20
-rw-r--r--arch/x86/boot/compressed/string.c9
-rw-r--r--arch/x86/boot/header.S158
-rw-r--r--arch/x86/boot/string.c35
-rw-r--r--arch/x86/boot/tools/build.c39
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S638
-rw-r--r--arch/x86/crypto/serpent-sse2-x86_64-asm_64.S761
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c1070
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c218
-rw-r--r--arch/x86/ia32/ia32entry.S43
-rw-r--r--arch/x86/include/asm/alternative-asm.h4
-rw-r--r--arch/x86/include/asm/amd_nb.h2
-rw-r--r--arch/x86/include/asm/apic.h7
-rw-r--r--arch/x86/include/asm/apic_flat_64.h7
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/atomic64_32.h2
-rw-r--r--arch/x86/include/asm/bitops.h76
-rw-r--r--arch/x86/include/asm/bootparam.h2
-rw-r--r--arch/x86/include/asm/cmpxchg.h163
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h46
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h43
-rw-r--r--arch/x86/include/asm/cpufeature.h3
-rw-r--r--arch/x86/include/asm/debugreg.h22
-rw-r--r--arch/x86/include/asm/desc.h12
-rw-r--r--arch/x86/include/asm/div64.h22
-rw-r--r--arch/x86/include/asm/e820.h2
-rw-r--r--arch/x86/include/asm/efi.h4
-rw-r--r--arch/x86/include/asm/fixmap.h2
-rw-r--r--arch/x86/include/asm/hardirq.h1
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/init.h2
-rw-r--r--arch/x86/include/asm/insn.h7
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h14
-rw-r--r--arch/x86/include/asm/iommu.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h90
-rw-r--r--arch/x86/include/asm/mach_timer.h2
-rw-r--r--arch/x86/include/asm/mach_traps.h2
-rw-r--r--arch/x86/include/asm/mc146818rtc.h4
-rw-r--r--arch/x86/include/asm/mce.h19
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/microcode.h2
-rw-r--r--arch/x86/include/asm/mrst.h18
-rw-r--r--arch/x86/include/asm/msr.h9
-rw-r--r--arch/x86/include/asm/numachip/numachip_csr.h167
-rw-r--r--arch/x86/include/asm/pci.h9
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/percpu.h77
-rw-r--r--arch/x86/include/asm/perf_event.h44
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/serpent.h63
-rw-r--r--arch/x86/include/asm/setup.h2
-rw-r--r--arch/x86/include/asm/smp.h6
-rw-r--r--arch/x86/include/asm/spinlock.h15
-rw-r--r--arch/x86/include/asm/system.h1
-rw-r--r--arch/x86/include/asm/thread_info.h11
-rw-r--r--arch/x86/include/asm/timer.h23
-rw-r--r--arch/x86/include/asm/topology.h4
-rw-r--r--arch/x86/include/asm/tsc.h2
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1
-rw-r--r--arch/x86/include/asm/x86_init.h7
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c10
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/amd_nb.c39
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/apic.c146
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c9
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c294
-rw-r--r--arch/x86/kernel/apic/io_apic.c15
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c9
-rw-r--r--arch/x86/kernel/apm_32.c16
-rw-r--r--arch/x86/kernel/asm-offsets.c2
-rw-r--r--arch/x86/kernel/check.c34
-rw-r--r--arch/x86/kernel/cpu/amd.c17
-rw-r--r--arch/x86/kernel/cpu/centaur.c2
-rw-r--r--arch/x86/kernel/cpu/common.c38
-rw-r--r--arch/x86/kernel/cpu/cpu.h5
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c36
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c229
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c94
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c278
-rw-r--r--arch/x86/kernel/cpu/perf_event.h51
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c2
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c29
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c2
-rw-r--r--arch/x86/kernel/cpu/powerflags.c3
-rw-r--r--arch/x86/kernel/cpu/proc.c4
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/dumpstack_32.c8
-rw-r--r--arch/x86/kernel/dumpstack_64.c8
-rw-r--r--arch/x86/kernel/e820.c117
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/entry_32.S4
-rw-r--r--arch/x86/kernel/entry_64.S249
-rw-r--r--arch/x86/kernel/head.c2
-rw-r--r--arch/x86/kernel/head32.c7
-rw-r--r--arch/x86/kernel/head64.c7
-rw-r--r--arch/x86/kernel/head_64.S4
-rw-r--r--arch/x86/kernel/hpet.c29
-rw-r--r--arch/x86/kernel/irq.c11
-rw-r--r--arch/x86/kernel/irq_32.c5
-rw-r--r--arch/x86/kernel/irq_64.c38
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/jump_label.c2
-rw-r--r--arch/x86/kernel/kvm.c181
-rw-r--r--arch/x86/kernel/kvmclock.c5
-rw-r--r--arch/x86/kernel/microcode_amd.c209
-rw-r--r--arch/x86/kernel/microcode_core.c91
-rw-r--r--arch/x86/kernel/mpparse.c14
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/nmi.c105
-rw-r--r--arch/x86/kernel/nmi_selftest.c180
-rw-r--r--arch/x86/kernel/pci-dma.c11
-rw-r--r--arch/x86/kernel/process.c10
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c3
-rw-r--r--arch/x86/kernel/quirks.c13
-rw-r--r--arch/x86/kernel/reboot.c21
-rw-r--r--arch/x86/kernel/rtc.c5
-rw-r--r--arch/x86/kernel/setup.c30
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smp.c72
-rw-r--r--arch/x86/kernel/smpboot.c20
-rw-r--r--arch/x86/kernel/trampoline.c4
-rw-r--r--arch/x86/kernel/traps.c27
-rw-r--r--arch/x86/kernel/tsc.c26
-rw-r--r--arch/x86/kernel/tsc_sync.c4
-rw-r--r--arch/x86/kernel/vsyscall_64.c77
-rw-r--r--arch/x86/kernel/x86_init.c6
-rw-r--r--arch/x86/kvm/Kconfig3
-rw-r--r--arch/x86/kvm/Makefile2
-rw-r--r--arch/x86/kvm/cpuid.c670
-rw-r--r--arch/x86/kvm/cpuid.h46
-rw-r--r--arch/x86/kvm/emulate.c436
-rw-r--r--arch/x86/kvm/i8254.c14
-rw-r--r--arch/x86/kvm/i8259.c24
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c547
-rw-r--r--arch/x86/kvm/mmu_audit.c29
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h86
-rw-r--r--arch/x86/kvm/pmu.c533
-rw-r--r--arch/x86/kvm/svm.c15
-rw-r--r--arch/x86/kvm/timer.c26
-rw-r--r--arch/x86/kvm/vmx.c194
-rw-r--r--arch/x86/kvm/x86.c1012
-rw-r--r--arch/x86/kvm/x86.h5
-rw-r--r--arch/x86/lguest/boot.c21
-rw-r--r--arch/x86/lib/inat.c9
-rw-r--r--arch/x86/lib/insn.c4
-rw-r--r--arch/x86/lib/string_32.c8
-rw-r--r--arch/x86/lib/x86-opcode-map.txt606
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c22
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/highmem_32.c2
-rw-r--r--arch/x86/mm/init.c31
-rw-r--r--arch/x86/mm/init_32.c65
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c33
-rw-r--r--arch/x86/mm/mmap.c4
-rw-r--r--arch/x86/mm/mmio-mod.c4
-rw-r--r--arch/x86/mm/numa.c49
-rw-r--r--arch/x86/mm/numa_32.c10
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/numa_emulation.c36
-rw-r--r--arch/x86/mm/pageattr.c8
-rw-r--r--arch/x86/mm/srat.c7
-rw-r--r--arch/x86/net/bpf_jit_comp.c4
-rw-r--r--arch/x86/oprofile/Makefile3
-rw-r--r--arch/x86/oprofile/init.c25
-rw-r--r--arch/x86/oprofile/nmi_int.c27
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c50
-rw-r--r--arch/x86/pci/Makefile5
-rw-r--r--arch/x86/pci/acpi.c75
-rw-r--r--arch/x86/pci/amd_bus.c43
-rw-r--r--arch/x86/pci/broadcom_bus.c62
-rw-r--r--arch/x86/pci/bus_numa.c31
-rw-r--r--arch/x86/pci/common.c19
-rw-r--r--arch/x86/pci/i386.c20
-rw-r--r--arch/x86/pci/legacy.c3
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/pci/pcbios.c2
-rw-r--r--arch/x86/platform/efi/efi.c12
-rw-r--r--arch/x86/platform/efi/efi_32.c48
-rw-r--r--arch/x86/platform/geode/alix.c2
-rw-r--r--arch/x86/platform/iris/iris.c2
-rw-r--r--arch/x86/platform/mrst/Makefile6
-rw-r--r--arch/x86/platform/mrst/early_printk_mrst.c16
-rw-r--r--arch/x86/platform/mrst/mrst.c112
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c2
-rw-r--r--arch/x86/tools/Makefile11
-rw-r--r--arch/x86/tools/gen-insn-attr-x86.awk21
-rw-r--r--arch/x86/tools/insn_sanity.c275
-rw-r--r--arch/x86/um/Kconfig8
-rw-r--r--arch/x86/um/asm/processor.h2
-rw-r--r--arch/x86/xen/Kconfig4
-rw-r--r--arch/x86/xen/debugfs.c2
-rw-r--r--arch/x86/xen/debugfs.h2
-rw-r--r--arch/x86/xen/enlighten.c5
-rw-r--r--arch/x86/xen/grant-table.c46
-rw-r--r--arch/x86/xen/mmu.c12
-rw-r--r--arch/x86/xen/setup.c27
230 files changed, 10597 insertions, 3876 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb9a1044a771..6c14ecd851d0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -26,6 +26,8 @@ config X86
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
select HAVE_MEMBLOCK
+ select HAVE_MEMBLOCK_NODE_MAP
+ select ARCH_DISCARD_MEMBLOCK
select ARCH_WANT_OPTIONAL_GPIOLIB
select ARCH_WANT_FRAME_POINTERS
select HAVE_DMA_ATTRS
@@ -58,8 +60,12 @@ config X86
select PERF_EVENTS
select HAVE_PERF_EVENTS_NMI
select ANON_INODES
+ select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386
+ select HAVE_CMPXCHG_LOCAL if !M386
+ select HAVE_CMPXCHG_DOUBLE
select HAVE_ARCH_KMEMCHECK
select HAVE_USER_RETURN_NOTIFIER
+ select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select HAVE_ARCH_JUMP_LABEL
select HAVE_TEXT_POKE_SMP
select HAVE_GENERIC_HARDIRQS
@@ -75,6 +81,7 @@ config X86
select HAVE_BPF_JIT if (X86_64 && NET)
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select GENERIC_IOMAP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -140,9 +147,6 @@ config NEED_SG_DMA_LENGTH
config GENERIC_ISA_DMA
def_bool ISA_DMA_API
-config GENERIC_IOMAP
- def_bool y
-
config GENERIC_BUG
def_bool y
depends on BUG
@@ -204,9 +208,6 @@ config ZONE_DMA32
bool
default X86_64
-config ARCH_POPULATES_NODE_MAP
- def_bool y
-
config AUDIT_ARCH
bool
default X86_64
@@ -343,6 +344,7 @@ config X86_EXTENDED_PLATFORM
If you enable this option then you'll be able to select support
for the following (non-PC) 64 bit x86 platforms:
+ Numascale NumaChip
ScaleMP vSMP
SGI Ultraviolet
@@ -351,6 +353,18 @@ config X86_EXTENDED_PLATFORM
endif
# This is an alphabetically sorted list of 64 bit extended platforms
# Please maintain the alphabetic order if and when there are additions
+config X86_NUMACHIP
+ bool "Numascale NumaChip"
+ depends on X86_64
+ depends on X86_EXTENDED_PLATFORM
+ depends on NUMA
+ depends on SMP
+ depends on X86_X2APIC
+ depends on !EDAC_AMD64
+ ---help---
+ Adds support for Numascale NumaChip large-SMP systems. Needed to
+ enable more than ~168 cores.
+ If you don't have one of these, you should say N here.
config X86_VSMP
bool "ScaleMP vSMP"
@@ -390,7 +404,7 @@ config X86_INTEL_CE
This option compiles in support for the CE4100 SOC for settop
boxes and media devices.
-config X86_INTEL_MID
+config X86_WANT_INTEL_MID
bool "Intel MID platform support"
depends on X86_32
depends on X86_EXTENDED_PLATFORM
@@ -399,13 +413,19 @@ config X86_INTEL_MID
systems which do not have the PCI legacy interfaces (Moorestown,
Medfield). If you are building for a PC class system say N here.
-if X86_INTEL_MID
+if X86_WANT_INTEL_MID
+
+config X86_INTEL_MID
+ bool
config X86_MRST
bool "Moorestown MID platform"
depends on PCI
depends on PCI_GOANY
depends on X86_IO_APIC
+ select X86_INTEL_MID
+ select SFI
+ select DW_APB_TIMER
select APB_TIMER
select I2C
select SPI
@@ -419,6 +439,26 @@ config X86_MRST
nor standard legacy replacement devices/features. e.g. Moorestown does
not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+config X86_MDFLD
+ bool "Medfield MID platform"
+ depends on PCI
+ depends on PCI_GOANY
+ depends on X86_IO_APIC
+ select X86_INTEL_MID
+ select SFI
+ select DW_APB_TIMER
+ select APB_TIMER
+ select I2C
+ select SPI
+ select INTEL_SCU_IPC
+ select X86_PLATFORM_DEVICES
+ ---help---
+ Medfield is Intel's Low Power Intel Architecture (LPIA) based Moblin
+ Internet Device(MID) platform.
+ Unlike standard x86 PCs, Medfield does not have many legacy devices
+ nor standard legacy replacement devices/features. e.g. Medfield does
+ not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
+
endif
config X86_RDC321X
@@ -616,7 +656,7 @@ config X86_SUMMIT_NUMA
config X86_CYCLONE_TIMER
def_bool y
- depends on X86_32_NON_STANDARD
+ depends on X86_SUMMIT
source "arch/x86/Kconfig.cpu"
@@ -644,9 +684,10 @@ config HPET_EMULATE_RTC
depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER
- def_bool y if MRST
- prompt "Langwell APB Timer Support" if X86_MRST
+ def_bool y if X86_INTEL_MID
+ prompt "Intel MID APB Timer Support" if X86_INTEL_MID
select DW_APB_TIMER
+ depends on X86_INTEL_MID && SFI
help
APB timer is the replacement for 8254, HPET on X86 MID platforms.
The APBT provides a stable time base on SMP
@@ -1474,6 +1515,13 @@ config EFI
resultant kernel should continue to boot on existing non-EFI
platforms.
+config EFI_STUB
+ bool "EFI stub support"
+ depends on EFI
+ ---help---
+ This kernel feature allows a bzImage to be loaded directly
+ by EFI firmware without the use of a bootloader.
+
config SECCOMP
def_bool y
prompt "Enable seccomp to safely compute untrusted bytecode"
@@ -1726,7 +1774,7 @@ source "drivers/sfi/Kconfig"
config X86_APM_BOOT
def_bool y
- depends on APM || APM_MODULE
+ depends on APM
menuconfig APM
tristate "APM (Advanced Power Management) BIOS support"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index e3ca7e0d858c..3c57033e2211 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -309,12 +309,6 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
-config CMPXCHG_LOCAL
- def_bool X86_64 || (X86_32 && !M386)
-
-config CMPXCHG_DOUBLE
- def_bool y
-
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index bf56e1793272..e46c2147397f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,9 +43,9 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally N here,
unless you want to debug such a crash.
-config EARLY_PRINTK_MRST
- bool "Early printk for MRST platform support"
- depends on EARLY_PRINTK && X86_MRST
+config EARLY_PRINTK_INTEL_MID
+ bool "Early printk for Intel MID platform support"
+ depends on EARLY_PRINTK && X86_INTEL_MID
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
@@ -63,8 +63,11 @@ config DEBUG_STACKOVERFLOW
bool "Check for stack overflows"
depends on DEBUG_KERNEL
---help---
- This option will cause messages to be printed if free stack space
- drops below a certain limit.
+ Say Y here if you want to check the overflows of kernel, IRQ
+ and exception stacks. This option will cause messages of the
+ stacks in detail when free stack space drops below a certain
+ limit.
+ If in doubt, say "N".
config X86_PTDUMP
bool "Export kernel pagetable layout to userspace via debugfs"
@@ -284,4 +287,16 @@ config DEBUG_STRICT_USER_COPY_CHECKS
If unsure, or if you run an older (pre 4.4) gcc, say N.
+config DEBUG_NMI_SELFTEST
+ bool "NMI Selftest"
+ depends on DEBUG_KERNEL && X86_LOCAL_APIC
+ ---help---
+ Enabling this option turns on a quick NMI selftest to verify
+ that the NMI behaves correctly.
+
+ This might help diagnose strange hangs that rely on NMI to
+ function properly.
+
+ If unsure, say N.
+
endmenu
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 09664efb9cee..b123b9a8f5b3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -23,7 +23,15 @@ LDFLAGS_vmlinux := -T
hostprogs-y := mkpiggy
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o $(obj)/piggy.o FORCE
+VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
+ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
+ $(obj)/piggy.o
+
+ifeq ($(CONFIG_EFI_STUB), y)
+ VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+endif
+
+$(obj)/vmlinux: $(VMLINUX_OBJS) FORCE
$(call if_changed,ld)
@:
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
new file mode 100644
index 000000000000..fec216f4fbc3
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.c
@@ -0,0 +1,1022 @@
+/* -----------------------------------------------------------------------
+ *
+ * Copyright 2011 Intel Corporation; author Matt Fleming
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+
+#include "eboot.h"
+
+static efi_system_table_t *sys_table;
+
+static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size,
+ unsigned long *desc_size)
+{
+ efi_memory_desc_t *m = NULL;
+ efi_status_t status;
+ unsigned long key;
+ u32 desc_version;
+
+ *map_size = sizeof(*m) * 32;
+again:
+ /*
+ * Add an additional efi_memory_desc_t because we're doing an
+ * allocation which may be in a new descriptor region.
+ */
+ *map_size += sizeof(*m);
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, *map_size, (void **)&m);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ status = efi_call_phys5(sys_table->boottime->get_memory_map, map_size,
+ m, &key, desc_size, &desc_version);
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ efi_call_phys1(sys_table->boottime->free_pool, m);
+ goto again;
+ }
+
+ if (status != EFI_SUCCESS)
+ efi_call_phys1(sys_table->boottime->free_pool, m);
+
+fail:
+ *map = m;
+ return status;
+}
+
+/*
+ * Allocate at the highest possible address that is not above 'max'.
+ */
+static efi_status_t high_alloc(unsigned long size, unsigned long align,
+ unsigned long *addr, unsigned long max)
+{
+ unsigned long map_size, desc_size;
+ efi_memory_desc_t *map;
+ efi_status_t status;
+ unsigned long nr_pages;
+ u64 max_addr = 0;
+ int i;
+
+ status = __get_map(&map, &map_size, &desc_size);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+again:
+ for (i = 0; i < map_size / desc_size; i++) {
+ efi_memory_desc_t *desc;
+ unsigned long m = (unsigned long)map;
+ u64 start, end;
+
+ desc = (efi_memory_desc_t *)(m + (i * desc_size));
+ if (desc->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ if (desc->num_pages < nr_pages)
+ continue;
+
+ start = desc->phys_addr;
+ end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+ if ((start + size) > end || (start + size) > max)
+ continue;
+
+ if (end - size > max)
+ end = max;
+
+ if (round_down(end - size, align) < start)
+ continue;
+
+ start = round_down(end - size, align);
+
+ /*
+ * Don't allocate at 0x0. It will confuse code that
+ * checks pointers against NULL.
+ */
+ if (start == 0x0)
+ continue;
+
+ if (start > max_addr)
+ max_addr = start;
+ }
+
+ if (!max_addr)
+ status = EFI_NOT_FOUND;
+ else {
+ status = efi_call_phys4(sys_table->boottime->allocate_pages,
+ EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+ nr_pages, &max_addr);
+ if (status != EFI_SUCCESS) {
+ max = max_addr;
+ max_addr = 0;
+ goto again;
+ }
+
+ *addr = max_addr;
+ }
+
+free_pool:
+ efi_call_phys1(sys_table->boottime->free_pool, map);
+
+fail:
+ return status;
+}
+
+/*
+ * Allocate at the lowest possible address.
+ */
+static efi_status_t low_alloc(unsigned long size, unsigned long align,
+ unsigned long *addr)
+{
+ unsigned long map_size, desc_size;
+ efi_memory_desc_t *map;
+ efi_status_t status;
+ unsigned long nr_pages;
+ int i;
+
+ status = __get_map(&map, &map_size, &desc_size);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ for (i = 0; i < map_size / desc_size; i++) {
+ efi_memory_desc_t *desc;
+ unsigned long m = (unsigned long)map;
+ u64 start, end;
+
+ desc = (efi_memory_desc_t *)(m + (i * desc_size));
+
+ if (desc->type != EFI_CONVENTIONAL_MEMORY)
+ continue;
+
+ if (desc->num_pages < nr_pages)
+ continue;
+
+ start = desc->phys_addr;
+ end = start + desc->num_pages * (1UL << EFI_PAGE_SHIFT);
+
+ /*
+ * Don't allocate at 0x0. It will confuse code that
+ * checks pointers against NULL. Skip the first 8
+ * bytes so we start at a nice even number.
+ */
+ if (start == 0x0)
+ start += 8;
+
+ start = round_up(start, align);
+ if ((start + size) > end)
+ continue;
+
+ status = efi_call_phys4(sys_table->boottime->allocate_pages,
+ EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+ nr_pages, &start);
+ if (status == EFI_SUCCESS) {
+ *addr = start;
+ break;
+ }
+ }
+
+ if (i == map_size / desc_size)
+ status = EFI_NOT_FOUND;
+
+free_pool:
+ efi_call_phys1(sys_table->boottime->free_pool, map);
+fail:
+ return status;
+}
+
+static void low_free(unsigned long size, unsigned long addr)
+{
+ unsigned long nr_pages;
+
+ nr_pages = round_up(size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+ efi_call_phys2(sys_table->boottime->free_pages, addr, size);
+}
+
+static void find_bits(unsigned long mask, u8 *pos, u8 *size)
+{
+ u8 first, len;
+
+ first = 0;
+ len = 0;
+
+ if (mask) {
+ while (!(mask & 0x1)) {
+ mask = mask >> 1;
+ first++;
+ }
+
+ while (mask & 0x1) {
+ mask = mask >> 1;
+ len++;
+ }
+ }
+
+ *pos = first;
+ *size = len;
+}
+
+/*
+ * See if we have Graphics Output Protocol
+ */
+static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,
+ unsigned long size)
+{
+ struct efi_graphics_output_protocol *gop, *first_gop;
+ struct efi_pixel_bitmask pixel_info;
+ unsigned long nr_gops;
+ efi_status_t status;
+ void **gop_handle;
+ u16 width, height;
+ u32 fb_base, fb_size;
+ u32 pixels_per_scan_line;
+ int pixel_format;
+ int i;
+
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, size, &gop_handle);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ status = efi_call_phys5(sys_table->boottime->locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, proto,
+ NULL, &size, gop_handle);
+ if (status != EFI_SUCCESS)
+ goto free_handle;
+
+ first_gop = NULL;
+
+ nr_gops = size / sizeof(void *);
+ for (i = 0; i < nr_gops; i++) {
+ struct efi_graphics_output_mode_info *info;
+ efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ void *pciio;
+ void *h = gop_handle[i];
+
+ status = efi_call_phys3(sys_table->boottime->handle_protocol,
+ h, proto, &gop);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ efi_call_phys3(sys_table->boottime->handle_protocol,
+ h, &pciio_proto, &pciio);
+
+ status = efi_call_phys4(gop->query_mode, gop,
+ gop->mode->mode, &size, &info);
+ if (status == EFI_SUCCESS && (!first_gop || pciio)) {
+ /*
+ * Apple provide GOPs that are not backed by
+ * real hardware (they're used to handle
+ * multiple displays). The workaround is to
+ * search for a GOP implementing the PCIIO
+ * protocol, and if one isn't found, to just
+ * fallback to the first GOP.
+ */
+ width = info->horizontal_resolution;
+ height = info->vertical_resolution;
+ fb_base = gop->mode->frame_buffer_base;
+ fb_size = gop->mode->frame_buffer_size;
+ pixel_format = info->pixel_format;
+ pixel_info = info->pixel_information;
+ pixels_per_scan_line = info->pixels_per_scan_line;
+
+ /*
+ * Once we've found a GOP supporting PCIIO,
+ * don't bother looking any further.
+ */
+ if (pciio)
+ break;
+
+ first_gop = gop;
+ }
+ }
+
+ /* Did we find any GOPs? */
+ if (!first_gop)
+ goto free_handle;
+
+ /* EFI framebuffer */
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_width = width;
+ si->lfb_height = height;
+ si->lfb_base = fb_base;
+ si->lfb_size = fb_size;
+ si->pages = 1;
+
+ if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) {
+ si->lfb_depth = 32;
+ si->lfb_linelength = pixels_per_scan_line * 4;
+ si->red_size = 8;
+ si->red_pos = 0;
+ si->green_size = 8;
+ si->green_pos = 8;
+ si->blue_size = 8;
+ si->blue_pos = 16;
+ si->rsvd_size = 8;
+ si->rsvd_pos = 24;
+ } else if (pixel_format == PIXEL_BGR_RESERVED_8BIT_PER_COLOR) {
+ si->lfb_depth = 32;
+ si->lfb_linelength = pixels_per_scan_line * 4;
+ si->red_size = 8;
+ si->red_pos = 16;
+ si->green_size = 8;
+ si->green_pos = 8;
+ si->blue_size = 8;
+ si->blue_pos = 0;
+ si->rsvd_size = 8;
+ si->rsvd_pos = 24;
+ } else if (pixel_format == PIXEL_BIT_MASK) {
+ find_bits(pixel_info.red_mask, &si->red_pos, &si->red_size);
+ find_bits(pixel_info.green_mask, &si->green_pos,
+ &si->green_size);
+ find_bits(pixel_info.blue_mask, &si->blue_pos, &si->blue_size);
+ find_bits(pixel_info.reserved_mask, &si->rsvd_pos,
+ &si->rsvd_size);
+ si->lfb_depth = si->red_size + si->green_size +
+ si->blue_size + si->rsvd_size;
+ si->lfb_linelength = (pixels_per_scan_line * si->lfb_depth) / 8;
+ } else {
+ si->lfb_depth = 4;
+ si->lfb_linelength = si->lfb_width / 2;
+ si->red_size = 0;
+ si->red_pos = 0;
+ si->green_size = 0;
+ si->green_pos = 0;
+ si->blue_size = 0;
+ si->blue_pos = 0;
+ si->rsvd_size = 0;
+ si->rsvd_pos = 0;
+ }
+
+free_handle:
+ efi_call_phys1(sys_table->boottime->free_pool, gop_handle);
+ return status;
+}
+
+/*
+ * See if we have Universal Graphics Adapter (UGA) protocol
+ */
+static efi_status_t setup_uga(struct screen_info *si, efi_guid_t *uga_proto,
+ unsigned long size)
+{
+ struct efi_uga_draw_protocol *uga, *first_uga;
+ unsigned long nr_ugas;
+ efi_status_t status;
+ u32 width, height;
+ void **uga_handle = NULL;
+ int i;
+
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, size, &uga_handle);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ status = efi_call_phys5(sys_table->boottime->locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, uga_proto,
+ NULL, &size, uga_handle);
+ if (status != EFI_SUCCESS)
+ goto free_handle;
+
+ first_uga = NULL;
+
+ nr_ugas = size / sizeof(void *);
+ for (i = 0; i < nr_ugas; i++) {
+ efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
+ void *handle = uga_handle[i];
+ u32 w, h, depth, refresh;
+ void *pciio;
+
+ status = efi_call_phys3(sys_table->boottime->handle_protocol,
+ handle, uga_proto, &uga);
+ if (status != EFI_SUCCESS)
+ continue;
+
+ efi_call_phys3(sys_table->boottime->handle_protocol,
+ handle, &pciio_proto, &pciio);
+
+ status = efi_call_phys5(uga->get_mode, uga, &w, &h,
+ &depth, &refresh);
+ if (status == EFI_SUCCESS && (!first_uga || pciio)) {
+ width = w;
+ height = h;
+
+ /*
+ * Once we've found a UGA supporting PCIIO,
+ * don't bother looking any further.
+ */
+ if (pciio)
+ break;
+
+ first_uga = uga;
+ }
+ }
+
+ if (!first_uga)
+ goto free_handle;
+
+ /* EFI framebuffer */
+ si->orig_video_isVGA = VIDEO_TYPE_EFI;
+
+ si->lfb_depth = 32;
+ si->lfb_width = width;
+ si->lfb_height = height;
+
+ si->red_size = 8;
+ si->red_pos = 16;
+ si->green_size = 8;
+ si->green_pos = 8;
+ si->blue_size = 8;
+ si->blue_pos = 0;
+ si->rsvd_size = 8;
+ si->rsvd_pos = 24;
+
+
+free_handle:
+ efi_call_phys1(sys_table->boottime->free_pool, uga_handle);
+ return status;
+}
+
+void setup_graphics(struct boot_params *boot_params)
+{
+ efi_guid_t graphics_proto = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
+ struct screen_info *si;
+ efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID;
+ efi_status_t status;
+ unsigned long size;
+ void **gop_handle = NULL;
+ void **uga_handle = NULL;
+
+ si = &boot_params->screen_info;
+ memset(si, 0, sizeof(*si));
+
+ size = 0;
+ status = efi_call_phys5(sys_table->boottime->locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, &graphics_proto,
+ NULL, &size, gop_handle);
+ if (status == EFI_BUFFER_TOO_SMALL)
+ status = setup_gop(si, &graphics_proto, size);
+
+ if (status != EFI_SUCCESS) {
+ size = 0;
+ status = efi_call_phys5(sys_table->boottime->locate_handle,
+ EFI_LOCATE_BY_PROTOCOL, &uga_proto,
+ NULL, &size, uga_handle);
+ if (status == EFI_BUFFER_TOO_SMALL)
+ setup_uga(si, &uga_proto, size);
+ }
+}
+
+struct initrd {
+ efi_file_handle_t *handle;
+ u64 size;
+};
+
+/*
+ * Check the cmdline for a LILO-style initrd= arguments.
+ *
+ * We only support loading an initrd from the same filesystem as the
+ * kernel image.
+ */
+static efi_status_t handle_ramdisks(efi_loaded_image_t *image,
+ struct setup_header *hdr)
+{
+ struct initrd *initrds;
+ unsigned long initrd_addr;
+ efi_guid_t fs_proto = EFI_FILE_SYSTEM_GUID;
+ u64 initrd_total;
+ efi_file_io_interface_t *io;
+ efi_file_handle_t *fh;
+ efi_status_t status;
+ int nr_initrds;
+ char *str;
+ int i, j, k;
+
+ initrd_addr = 0;
+ initrd_total = 0;
+
+ str = (char *)(unsigned long)hdr->cmd_line_ptr;
+
+ j = 0; /* See close_handles */
+
+ if (!str || !*str)
+ return EFI_SUCCESS;
+
+ for (nr_initrds = 0; *str; nr_initrds++) {
+ str = strstr(str, "initrd=");
+ if (!str)
+ break;
+
+ str += 7;
+
+ /* Skip any leading slashes */
+ while (*str == '/' || *str == '\\')
+ str++;
+
+ while (*str && *str != ' ' && *str != '\n')
+ str++;
+ }
+
+ if (!nr_initrds)
+ return EFI_SUCCESS;
+
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA,
+ nr_initrds * sizeof(*initrds),
+ &initrds);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ str = (char *)(unsigned long)hdr->cmd_line_ptr;
+ for (i = 0; i < nr_initrds; i++) {
+ struct initrd *initrd;
+ efi_file_handle_t *h;
+ efi_file_info_t *info;
+ efi_char16_t filename[256];
+ unsigned long info_sz;
+ efi_guid_t info_guid = EFI_FILE_INFO_ID;
+ efi_char16_t *p;
+ u64 file_sz;
+
+ str = strstr(str, "initrd=");
+ if (!str)
+ break;
+
+ str += 7;
+
+ initrd = &initrds[i];
+ p = filename;
+
+ /* Skip any leading slashes */
+ while (*str == '/' || *str == '\\')
+ str++;
+
+ while (*str && *str != ' ' && *str != '\n') {
+ if (p >= filename + sizeof(filename))
+ break;
+
+ *p++ = *str++;
+ }
+
+ *p = '\0';
+
+ /* Only open the volume once. */
+ if (!i) {
+ efi_boot_services_t *boottime;
+
+ boottime = sys_table->boottime;
+
+ status = efi_call_phys3(boottime->handle_protocol,
+ image->device_handle, &fs_proto, &io);
+ if (status != EFI_SUCCESS)
+ goto free_initrds;
+
+ status = efi_call_phys2(io->open_volume, io, &fh);
+ if (status != EFI_SUCCESS)
+ goto free_initrds;
+ }
+
+ status = efi_call_phys5(fh->open, fh, &h, filename,
+ EFI_FILE_MODE_READ, (u64)0);
+ if (status != EFI_SUCCESS)
+ goto close_handles;
+
+ initrd->handle = h;
+
+ info_sz = 0;
+ status = efi_call_phys4(h->get_info, h, &info_guid,
+ &info_sz, NULL);
+ if (status != EFI_BUFFER_TOO_SMALL)
+ goto close_handles;
+
+grow:
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, info_sz, &info);
+ if (status != EFI_SUCCESS)
+ goto close_handles;
+
+ status = efi_call_phys4(h->get_info, h, &info_guid,
+ &info_sz, info);
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ efi_call_phys1(sys_table->boottime->free_pool, info);
+ goto grow;
+ }
+
+ file_sz = info->file_size;
+ efi_call_phys1(sys_table->boottime->free_pool, info);
+
+ if (status != EFI_SUCCESS)
+ goto close_handles;
+
+ initrd->size = file_sz;
+ initrd_total += file_sz;
+ }
+
+ if (initrd_total) {
+ unsigned long addr;
+
+ /*
+ * Multiple initrd's need to be at consecutive
+ * addresses in memory, so allocate enough memory for
+ * all the initrd's.
+ */
+ status = high_alloc(initrd_total, 0x1000,
+ &initrd_addr, hdr->initrd_addr_max);
+ if (status != EFI_SUCCESS)
+ goto close_handles;
+
+ /* We've run out of free low memory. */
+ if (initrd_addr > hdr->initrd_addr_max) {
+ status = EFI_INVALID_PARAMETER;
+ goto free_initrd_total;
+ }
+
+ addr = initrd_addr;
+ for (j = 0; j < nr_initrds; j++) {
+ u64 size;
+
+ size = initrds[j].size;
+ while (size) {
+ u64 chunksize;
+ if (size > EFI_READ_CHUNK_SIZE)
+ chunksize = EFI_READ_CHUNK_SIZE;
+ else
+ chunksize = size;
+ status = efi_call_phys3(fh->read,
+ initrds[j].handle,
+ &chunksize, addr);
+ if (status != EFI_SUCCESS)
+ goto free_initrd_total;
+ addr += chunksize;
+ size -= chunksize;
+ }
+
+ efi_call_phys1(fh->close, initrds[j].handle);
+ }
+
+ }
+
+ efi_call_phys1(sys_table->boottime->free_pool, initrds);
+
+ hdr->ramdisk_image = initrd_addr;
+ hdr->ramdisk_size = initrd_total;
+
+ return status;
+
+free_initrd_total:
+ low_free(initrd_total, initrd_addr);
+
+close_handles:
+ for (k = j; k < nr_initrds; k++)
+ efi_call_phys1(fh->close, initrds[k].handle);
+free_initrds:
+ efi_call_phys1(sys_table->boottime->free_pool, initrds);
+fail:
+ hdr->ramdisk_image = 0;
+ hdr->ramdisk_size = 0;
+
+ return status;
+}
+
+/*
+ * Because the x86 boot code expects to be passed a boot_params we
+ * need to create one ourselves (usually the bootloader would create
+ * one for us).
+ */
+static efi_status_t make_boot_params(struct boot_params *boot_params,
+ efi_loaded_image_t *image,
+ void *handle)
+{
+ struct efi_info *efi = &boot_params->efi_info;
+ struct apm_bios_info *bi = &boot_params->apm_bios_info;
+ struct sys_desc_table *sdt = &boot_params->sys_desc_table;
+ struct e820entry *e820_map = &boot_params->e820_map[0];
+ struct e820entry *prev = NULL;
+ struct setup_header *hdr = &boot_params->hdr;
+ unsigned long size, key, desc_size, _size;
+ efi_memory_desc_t *mem_map;
+ void *options = image->load_options;
+ u32 load_options_size = image->load_options_size / 2; /* ASCII */
+ int options_size = 0;
+ efi_status_t status;
+ __u32 desc_version;
+ unsigned long cmdline;
+ u8 nr_entries;
+ u16 *s2;
+ u8 *s1;
+ int i;
+
+ hdr->type_of_loader = 0x21;
+
+ /* Convert unicode cmdline to ascii */
+ cmdline = 0;
+ s2 = (u16 *)options;
+
+ if (s2) {
+ while (*s2 && *s2 != '\n' && options_size < load_options_size) {
+ s2++;
+ options_size++;
+ }
+
+ if (options_size) {
+ if (options_size > hdr->cmdline_size)
+ options_size = hdr->cmdline_size;
+
+ options_size++; /* NUL termination */
+
+ status = low_alloc(options_size, 1, &cmdline);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ s1 = (u8 *)(unsigned long)cmdline;
+ s2 = (u16 *)options;
+
+ for (i = 0; i < options_size - 1; i++)
+ *s1++ = *s2++;
+
+ *s1 = '\0';
+ }
+ }
+
+ hdr->cmd_line_ptr = cmdline;
+
+ hdr->ramdisk_image = 0;
+ hdr->ramdisk_size = 0;
+
+ status = handle_ramdisks(image, hdr);
+ if (status != EFI_SUCCESS)
+ goto free_cmdline;
+
+ setup_graphics(boot_params);
+
+ /* Clear APM BIOS info */
+ memset(bi, 0, sizeof(*bi));
+
+ memset(sdt, 0, sizeof(*sdt));
+
+ memcpy(&efi->efi_loader_signature, EFI_LOADER_SIGNATURE, sizeof(__u32));
+
+ size = sizeof(*mem_map) * 32;
+
+again:
+ size += sizeof(*mem_map);
+ _size = size;
+ status = low_alloc(size, 1, (unsigned long *)&mem_map);
+ if (status != EFI_SUCCESS)
+ goto free_cmdline;
+
+ status = efi_call_phys5(sys_table->boottime->get_memory_map, &size,
+ mem_map, &key, &desc_size, &desc_version);
+ if (status == EFI_BUFFER_TOO_SMALL) {
+ low_free(_size, (unsigned long)mem_map);
+ goto again;
+ }
+
+ if (status != EFI_SUCCESS)
+ goto free_mem_map;
+
+ efi->efi_systab = (unsigned long)sys_table;
+ efi->efi_memdesc_size = desc_size;
+ efi->efi_memdesc_version = desc_version;
+ efi->efi_memmap = (unsigned long)mem_map;
+ efi->efi_memmap_size = size;
+
+#ifdef CONFIG_X86_64
+ efi->efi_systab_hi = (unsigned long)sys_table >> 32;
+ efi->efi_memmap_hi = (unsigned long)mem_map >> 32;
+#endif
+
+ /* Might as well exit boot services now */
+ status = efi_call_phys2(sys_table->boottime->exit_boot_services,
+ handle, key);
+ if (status != EFI_SUCCESS)
+ goto free_mem_map;
+
+ /* Historic? */
+ boot_params->alt_mem_k = 32 * 1024;
+
+ /*
+ * Convert the EFI memory map to E820.
+ */
+ nr_entries = 0;
+ for (i = 0; i < size / desc_size; i++) {
+ efi_memory_desc_t *d;
+ unsigned int e820_type = 0;
+ unsigned long m = (unsigned long)mem_map;
+
+ d = (efi_memory_desc_t *)(m + (i * desc_size));
+ switch (d->type) {
+ case EFI_RESERVED_TYPE:
+ case EFI_RUNTIME_SERVICES_CODE:
+ case EFI_RUNTIME_SERVICES_DATA:
+ case EFI_MEMORY_MAPPED_IO:
+ case EFI_MEMORY_MAPPED_IO_PORT_SPACE:
+ case EFI_PAL_CODE:
+ e820_type = E820_RESERVED;
+ break;
+
+ case EFI_UNUSABLE_MEMORY:
+ e820_type = E820_UNUSABLE;
+ break;
+
+ case EFI_ACPI_RECLAIM_MEMORY:
+ e820_type = E820_ACPI;
+ break;
+
+ case EFI_LOADER_CODE:
+ case EFI_LOADER_DATA:
+ case EFI_BOOT_SERVICES_CODE:
+ case EFI_BOOT_SERVICES_DATA:
+ case EFI_CONVENTIONAL_MEMORY:
+ e820_type = E820_RAM;
+ break;
+
+ case EFI_ACPI_MEMORY_NVS:
+ e820_type = E820_NVS;
+ break;
+
+ default:
+ continue;
+ }
+
+ /* Merge adjacent mappings */
+ if (prev && prev->type == e820_type &&
+ (prev->addr + prev->size) == d->phys_addr)
+ prev->size += d->num_pages << 12;
+ else {
+ e820_map->addr = d->phys_addr;
+ e820_map->size = d->num_pages << 12;
+ e820_map->type = e820_type;
+ prev = e820_map++;
+ nr_entries++;
+ }
+ }
+
+ boot_params->e820_entries = nr_entries;
+
+ return EFI_SUCCESS;
+
+free_mem_map:
+ low_free(_size, (unsigned long)mem_map);
+free_cmdline:
+ if (options_size)
+ low_free(options_size, hdr->cmd_line_ptr);
+fail:
+ return status;
+}
+
+/*
+ * On success we return a pointer to a boot_params structure, and NULL
+ * on failure.
+ */
+struct boot_params *efi_main(void *handle, efi_system_table_t *_table)
+{
+ struct boot_params *boot_params;
+ unsigned long start, nr_pages;
+ struct desc_ptr *gdt, *idt;
+ efi_loaded_image_t *image;
+ struct setup_header *hdr;
+ efi_status_t status;
+ efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
+ struct desc_struct *desc;
+
+ sys_table = _table;
+
+ /* Check if we were booted by the EFI firmware */
+ if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
+ goto fail;
+
+ status = efi_call_phys3(sys_table->boottime->handle_protocol,
+ handle, &proto, (void *)&image);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ status = low_alloc(0x4000, 1, (unsigned long *)&boot_params);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ memset(boot_params, 0x0, 0x4000);
+
+ /* Copy first two sectors to boot_params */
+ memcpy(boot_params, image->image_base, 1024);
+
+ hdr = &boot_params->hdr;
+
+ /*
+ * The EFI firmware loader could have placed the kernel image
+ * anywhere in memory, but the kernel has various restrictions
+ * on the max physical address it can run at. Attempt to move
+ * the kernel to boot_params.pref_address, or as low as
+ * possible.
+ */
+ start = hdr->pref_address;
+ nr_pages = round_up(hdr->init_size, EFI_PAGE_SIZE) / EFI_PAGE_SIZE;
+
+ status = efi_call_phys4(sys_table->boottime->allocate_pages,
+ EFI_ALLOCATE_ADDRESS, EFI_LOADER_DATA,
+ nr_pages, &start);
+ if (status != EFI_SUCCESS) {
+ status = low_alloc(hdr->init_size, hdr->kernel_alignment,
+ &start);
+ if (status != EFI_SUCCESS)
+ goto fail;
+ }
+
+ hdr->code32_start = (__u32)start;
+ hdr->pref_address = (__u64)(unsigned long)image->image_base;
+
+ memcpy((void *)start, image->image_base, image->image_size);
+
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, sizeof(*gdt),
+ (void **)&gdt);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ gdt->size = 0x800;
+ status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ status = efi_call_phys3(sys_table->boottime->allocate_pool,
+ EFI_LOADER_DATA, sizeof(*idt),
+ (void **)&idt);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ idt->size = 0;
+ idt->address = 0;
+
+ status = make_boot_params(boot_params, image, handle);
+ if (status != EFI_SUCCESS)
+ goto fail;
+
+ memset((char *)gdt->address, 0x0, gdt->size);
+ desc = (struct desc_struct *)gdt->address;
+
+ /* The first GDT is a dummy and the second is unused. */
+ desc += 2;
+
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+
+ desc++;
+ desc->limit0 = 0xffff;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_DATA | SEG_TYPE_READ_WRITE;
+ desc->s = DESC_TYPE_CODE_DATA;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0xf;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = SEG_OP_SIZE_32BIT;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+
+#ifdef CONFIG_X86_64
+ /* Task segment value */
+ desc++;
+ desc->limit0 = 0x0000;
+ desc->base0 = 0x0000;
+ desc->base1 = 0x0000;
+ desc->type = SEG_TYPE_TSS;
+ desc->s = 0;
+ desc->dpl = 0;
+ desc->p = 1;
+ desc->limit = 0x0;
+ desc->avl = 0;
+ desc->l = 0;
+ desc->d = 0;
+ desc->g = SEG_GRANULARITY_4KB;
+ desc->base2 = 0x00;
+#endif /* CONFIG_X86_64 */
+
+ asm volatile ("lidt %0" : : "m" (*idt));
+ asm volatile ("lgdt %0" : : "m" (*gdt));
+
+ asm volatile("cli");
+
+ return boot_params;
+fail:
+ return NULL;
+}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
new file mode 100644
index 000000000000..39251663e65b
--- /dev/null
+++ b/arch/x86/boot/compressed/eboot.h
@@ -0,0 +1,61 @@
+#ifndef BOOT_COMPRESSED_EBOOT_H
+#define BOOT_COMPRESSED_EBOOT_H
+
+#define SEG_TYPE_DATA (0 << 3)
+#define SEG_TYPE_READ_WRITE (1 << 1)
+#define SEG_TYPE_CODE (1 << 3)
+#define SEG_TYPE_EXEC_READ (1 << 1)
+#define SEG_TYPE_TSS ((1 << 3) | (1 << 0))
+#define SEG_OP_SIZE_32BIT (1 << 0)
+#define SEG_GRANULARITY_4KB (1 << 0)
+
+#define DESC_TYPE_CODE_DATA (1 << 0)
+
+#define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT)
+#define EFI_READ_CHUNK_SIZE (1024 * 1024)
+
+#define PIXEL_RGB_RESERVED_8BIT_PER_COLOR 0
+#define PIXEL_BGR_RESERVED_8BIT_PER_COLOR 1
+#define PIXEL_BIT_MASK 2
+#define PIXEL_BLT_ONLY 3
+#define PIXEL_FORMAT_MAX 4
+
+struct efi_pixel_bitmask {
+ u32 red_mask;
+ u32 green_mask;
+ u32 blue_mask;
+ u32 reserved_mask;
+};
+
+struct efi_graphics_output_mode_info {
+ u32 version;
+ u32 horizontal_resolution;
+ u32 vertical_resolution;
+ int pixel_format;
+ struct efi_pixel_bitmask pixel_information;
+ u32 pixels_per_scan_line;
+} __packed;
+
+struct efi_graphics_output_protocol_mode {
+ u32 max_mode;
+ u32 mode;
+ unsigned long info;
+ unsigned long size_of_info;
+ u64 frame_buffer_base;
+ unsigned long frame_buffer_size;
+} __packed;
+
+struct efi_graphics_output_protocol {
+ void *query_mode;
+ unsigned long set_mode;
+ unsigned long blt;
+ struct efi_graphics_output_protocol_mode *mode;
+};
+
+struct efi_uga_draw_protocol {
+ void *get_mode;
+ void *set_mode;
+ void *blt;
+};
+
+#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
new file mode 100644
index 000000000000..a53440e81d52
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_32.S
@@ -0,0 +1,86 @@
+/*
+ * EFI call stub for IA32.
+ *
+ * This stub allows us to make EFI calls in physical mode with interrupts
+ * turned off. Note that this implementation is different from the one in
+ * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
+ * mode at this point.
+ */
+
+#include <linux/linkage.h>
+#include <asm/page_types.h>
+
+/*
+ * efi_call_phys(void *, ...) is a function with variable parameters.
+ * All the callers of this function assure that all the parameters are 4-bytes.
+ */
+
+/*
+ * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
+ * So we'd better save all of them at the beginning of this function and restore
+ * at the end no matter how many we use, because we can not assure EFI runtime
+ * service functions will comply with gcc calling convention, too.
+ */
+
+.text
+ENTRY(efi_call_phys)
+ /*
+ * 0. The function can only be called in Linux kernel. So CS has been
+ * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
+ * the values of these registers are the same. And, the corresponding
+ * GDT entries are identical. So I will do nothing about segment reg
+ * and GDT, but change GDT base register in prelog and epilog.
+ */
+
+ /*
+ * 1. Because we haven't been relocated by this point we need to
+ * use relative addressing.
+ */
+ call 1f
+1: popl %edx
+ subl $1b, %edx
+
+ /*
+ * 2. Now on the top of stack is the return
+ * address in the caller of efi_call_phys(), then parameter 1,
+ * parameter 2, ..., param n. To make things easy, we save the return
+ * address of efi_call_phys in a global variable.
+ */
+ popl %ecx
+ movl %ecx, saved_return_addr(%edx)
+ /* get the function pointer into ECX*/
+ popl %ecx
+ movl %ecx, efi_rt_function_ptr(%edx)
+
+ /*
+ * 3. Call the physical function.
+ */
+ call *%ecx
+
+ /*
+ * 4. Balance the stack. And because EAX contain the return value,
+ * we'd better not clobber it. We need to calculate our address
+ * again because %ecx and %edx are not preserved across EFI function
+ * calls.
+ */
+ call 1f
+1: popl %edx
+ subl $1b, %edx
+
+ movl efi_rt_function_ptr(%edx), %ecx
+ pushl %ecx
+
+ /*
+ * 10. Push the saved return address onto the stack and return.
+ */
+ movl saved_return_addr(%edx), %ecx
+ pushl %ecx
+ ret
+ENDPROC(efi_call_phys)
+.previous
+
+.data
+saved_return_addr:
+ .long 0
+efi_rt_function_ptr:
+ .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
new file mode 100644
index 000000000000..cedc60de86eb
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -0,0 +1 @@
+#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 67a655a39ce4..a0559930a180 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -32,6 +32,28 @@
__HEAD
ENTRY(startup_32)
+#ifdef CONFIG_EFI_STUB
+ /*
+ * We don't need the return address, so set up the stack so
+ * efi_main() can find its arugments.
+ */
+ add $0x4, %esp
+
+ call efi_main
+ cmpl $0, %eax
+ je preferred_addr
+ movl %eax, %esi
+ call 1f
+1:
+ popl %eax
+ subl $1b, %eax
+ subl BP_pref_address(%esi), %eax
+ add BP_code32_start(%esi), %eax
+ leal preferred_addr(%eax), %eax
+ jmp *%eax
+
+preferred_addr:
+#endif
cld
/*
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 35af09d13dc1..558d76ce23bc 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -199,6 +199,26 @@ ENTRY(startup_64)
* an identity mapped page table being provied that maps our
* entire text+data+bss and hopefully all of memory.
*/
+#ifdef CONFIG_EFI_STUB
+ pushq %rsi
+ mov %rcx, %rdi
+ mov %rdx, %rsi
+ call efi_main
+ popq %rsi
+ cmpq $0,%rax
+ je preferred_addr
+ movq %rax,%rsi
+ call 1f
+1:
+ popq %rax
+ subq $1b, %rax
+ subq BP_pref_address(%rsi), %rax
+ add BP_code32_start(%esi), %eax
+ leaq preferred_addr(%rax), %rax
+ jmp *%rax
+
+preferred_addr:
+#endif
/* Setup data segments. */
xorl %eax, %eax
diff --git a/arch/x86/boot/compressed/string.c b/arch/x86/boot/compressed/string.c
index 19b3e693cd72..ffb9c5c9d748 100644
--- a/arch/x86/boot/compressed/string.c
+++ b/arch/x86/boot/compressed/string.c
@@ -1,2 +1,11 @@
#include "misc.h"
+
+int memcmp(const void *s1, const void *s2, size_t len)
+{
+ u8 diff;
+ asm("repe; cmpsb; setnz %0"
+ : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len));
+ return diff;
+}
+
#include "../string.c"
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index bdb4d458ec8c..f1bbeeb09148 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -45,6 +45,11 @@ SYSSEG = 0x1000 /* historical load address >> 4 */
.global bootsect_start
bootsect_start:
+#ifdef CONFIG_EFI_STUB
+ # "MZ", MS-DOS header
+ .byte 0x4d
+ .byte 0x5a
+#endif
# Normalize the start address
ljmp $BOOTSEG, $start2
@@ -79,6 +84,14 @@ bs_die:
# invoke the BIOS reset code...
ljmp $0xf000,$0xfff0
+#ifdef CONFIG_EFI_STUB
+ .org 0x3c
+ #
+ # Offset to the PE header.
+ #
+ .long pe_header
+#endif /* CONFIG_EFI_STUB */
+
.section ".bsdata", "a"
bugger_off_msg:
.ascii "Direct booting from floppy is no longer supported.\r\n"
@@ -87,6 +100,141 @@ bugger_off_msg:
.ascii "Remove disk and press any key to reboot . . .\r\n"
.byte 0
+#ifdef CONFIG_EFI_STUB
+pe_header:
+ .ascii "PE"
+ .word 0
+
+coff_header:
+#ifdef CONFIG_X86_32
+ .word 0x14c # i386
+#else
+ .word 0x8664 # x86-64
+#endif
+ .word 2 # nr_sections
+ .long 0 # TimeDateStamp
+ .long 0 # PointerToSymbolTable
+ .long 1 # NumberOfSymbols
+ .word section_table - optional_header # SizeOfOptionalHeader
+#ifdef CONFIG_X86_32
+ .word 0x306 # Characteristics.
+ # IMAGE_FILE_32BIT_MACHINE |
+ # IMAGE_FILE_DEBUG_STRIPPED |
+ # IMAGE_FILE_EXECUTABLE_IMAGE |
+ # IMAGE_FILE_LINE_NUMS_STRIPPED
+#else
+ .word 0x206 # Characteristics
+ # IMAGE_FILE_DEBUG_STRIPPED |
+ # IMAGE_FILE_EXECUTABLE_IMAGE |
+ # IMAGE_FILE_LINE_NUMS_STRIPPED
+#endif
+
+optional_header:
+#ifdef CONFIG_X86_32
+ .word 0x10b # PE32 format
+#else
+ .word 0x20b # PE32+ format
+#endif
+ .byte 0x02 # MajorLinkerVersion
+ .byte 0x14 # MinorLinkerVersion
+
+ # Filled in by build.c
+ .long 0 # SizeOfCode
+
+ .long 0 # SizeOfInitializedData
+ .long 0 # SizeOfUninitializedData
+
+ # Filled in by build.c
+ .long 0x0000 # AddressOfEntryPoint
+
+ .long 0x0000 # BaseOfCode
+#ifdef CONFIG_X86_32
+ .long 0 # data
+#endif
+
+extra_header_fields:
+#ifdef CONFIG_X86_32
+ .long 0 # ImageBase
+#else
+ .quad 0 # ImageBase
+#endif
+ .long 0x1000 # SectionAlignment
+ .long 0x200 # FileAlignment
+ .word 0 # MajorOperatingSystemVersion
+ .word 0 # MinorOperatingSystemVersion
+ .word 0 # MajorImageVersion
+ .word 0 # MinorImageVersion
+ .word 0 # MajorSubsystemVersion
+ .word 0 # MinorSubsystemVersion
+ .long 0 # Win32VersionValue
+
+ #
+ # The size of the bzImage is written in tools/build.c
+ #
+ .long 0 # SizeOfImage
+
+ .long 0x200 # SizeOfHeaders
+ .long 0 # CheckSum
+ .word 0xa # Subsystem (EFI application)
+ .word 0 # DllCharacteristics
+#ifdef CONFIG_X86_32
+ .long 0 # SizeOfStackReserve
+ .long 0 # SizeOfStackCommit
+ .long 0 # SizeOfHeapReserve
+ .long 0 # SizeOfHeapCommit
+#else
+ .quad 0 # SizeOfStackReserve
+ .quad 0 # SizeOfStackCommit
+ .quad 0 # SizeOfHeapReserve
+ .quad 0 # SizeOfHeapCommit
+#endif
+ .long 0 # LoaderFlags
+ .long 0x1 # NumberOfRvaAndSizes
+
+ .quad 0 # ExportTable
+ .quad 0 # ImportTable
+ .quad 0 # ResourceTable
+ .quad 0 # ExceptionTable
+ .quad 0 # CertificationTable
+ .quad 0 # BaseRelocationTable
+
+ # Section table
+section_table:
+ .ascii ".text"
+ .byte 0
+ .byte 0
+ .byte 0
+ .long 0
+ .long 0x0 # startup_{32,64}
+ .long 0 # Size of initialized data
+ # on disk
+ .long 0x0 # startup_{32,64}
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0x60500020 # Characteristics (section flags)
+
+ #
+ # The EFI application loader requires a relocation section
+ # because EFI applications are relocatable and not having
+ # this section seems to confuse it. But since we don't need
+ # the loader to fixup any relocs for us just fill it with a
+ # single dummy reloc.
+ #
+ .ascii ".reloc"
+ .byte 0
+ .byte 0
+ .long reloc_end - reloc_start
+ .long reloc_start
+ .long reloc_end - reloc_start # SizeOfRawData
+ .long reloc_start # PointerToRawData
+ .long 0 # PointerToRelocations
+ .long 0 # PointerToLineNumbers
+ .word 0 # NumberOfRelocations
+ .word 0 # NumberOfLineNumbers
+ .long 0x42100040 # Characteristics (section flags)
+#endif /* CONFIG_EFI_STUB */
# Kernel attributes; used by setup. This is part 1 of the
# header, from the old boot sector.
@@ -318,3 +466,13 @@ die:
setup_corrupt:
.byte 7
.string "No setup signature found...\n"
+
+ .data
+dummy: .long 0
+
+ .section .reloc
+reloc_start:
+ .long dummy - reloc_start
+ .long 10
+ .word 0
+reloc_end:
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 3cbc4058dd26..574dedfe2890 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -111,3 +111,38 @@ unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int bas
return result;
}
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t strlen(const char *s)
+{
+ const char *sc;
+
+ for (sc = s; *sc != '\0'; ++sc)
+ /* nothing */;
+ return sc - s;
+}
+
+/**
+ * strstr - Find the first substring in a %NUL terminated string
+ * @s1: The string to be searched
+ * @s2: The string to search for
+ */
+char *strstr(const char *s1, const char *s2)
+{
+ size_t l1, l2;
+
+ l2 = strlen(s2);
+ if (!l2)
+ return (char *)s1;
+ l1 = strlen(s1);
+ while (l1 >= l2) {
+ l1--;
+ if (!memcmp(s1, s2, l2))
+ return (char *)s1;
+ s1++;
+ }
+ return NULL;
+}
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index fdc60a0b3c20..4e9bd6bcafa6 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -135,6 +135,9 @@ static void usage(void)
int main(int argc, char ** argv)
{
+#ifdef CONFIG_EFI_STUB
+ unsigned int file_sz, pe_header;
+#endif
unsigned int i, sz, setup_sectors;
int c;
u32 sys_size;
@@ -194,6 +197,42 @@ int main(int argc, char ** argv)
buf[0x1f6] = sys_size >> 16;
buf[0x1f7] = sys_size >> 24;
+#ifdef CONFIG_EFI_STUB
+ file_sz = sz + i + ((sys_size * 16) - sz);
+
+ pe_header = *(unsigned int *)&buf[0x3c];
+
+ /* Size of code */
+ *(unsigned int *)&buf[pe_header + 0x1c] = file_sz;
+
+ /* Size of image */
+ *(unsigned int *)&buf[pe_header + 0x50] = file_sz;
+
+#ifdef CONFIG_X86_32
+ /* Address of entry point */
+ *(unsigned int *)&buf[pe_header + 0x28] = i;
+
+ /* .text size */
+ *(unsigned int *)&buf[pe_header + 0xb0] = file_sz;
+
+ /* .text size of initialised data */
+ *(unsigned int *)&buf[pe_header + 0xb8] = file_sz;
+#else
+ /*
+ * Address of entry point. startup_32 is at the beginning and
+ * the 64-bit entry point (startup_64) is always 512 bytes
+ * after.
+ */
+ *(unsigned int *)&buf[pe_header + 0x28] = i + 512;
+
+ /* .text size */
+ *(unsigned int *)&buf[pe_header + 0xc0] = file_sz;
+
+ /* .text size of initialised data */
+ *(unsigned int *)&buf[pe_header + 0xc8] = file_sz;
+#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_EFI_STUB */
+
crc = partial_crc32(buf, i, crc);
if (fwrite(buf, 1, i, stdout) != i)
die("Writing setup failed");
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3537d4b91f74..2b0b9631474b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,12 +5,14 @@
obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
+obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -20,12 +22,14 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
aes-i586-y := aes-i586-asm_32.o aes_glue.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
+serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
+serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..4e37677ca851
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
+/*
+ * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-i586-asm_32.S"
+.text
+
+#define arg_ctx 4
+#define arg_dst 8
+#define arg_src 12
+#define arg_xor 16
+
+/**********************************************************************
+ 4-way SSE2 serpent
+ **********************************************************************/
+#define CTX %edx
+
+#define RA %xmm0
+#define RB %xmm1
+#define RC %xmm2
+#define RD %xmm3
+#define RE %xmm4
+
+#define RT0 %xmm5
+#define RT1 %xmm6
+
+#define RNOT %xmm7
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, x4); \
+ get_key(i, 1, RT0); \
+ get_key(i, 2, RT1); \
+ pxor x4, x0; \
+ pxor RT0, x1; \
+ pxor RT1, x2; \
+ get_key(i, 3, x4); \
+ pxor x4, x3;
+
+#define LK(x0, x1, x2, x3, x4, i) \
+ movdqa x0, x4; \
+ pslld $13, x0; \
+ psrld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x0, x1; \
+ movdqa x2, x4; \
+ pslld $3, x2; \
+ psrld $(32 - 3), x4; \
+ por x4, x2; \
+ pxor x2, x1; \
+ movdqa x1, x4; \
+ pslld $1, x1; \
+ psrld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x2, x3; \
+ pxor x4, x3; \
+ movdqa x3, x4; \
+ pslld $7, x3; \
+ psrld $(32 - 7), x4; \
+ por x4, x3; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x3, x0; \
+ pxor x3, x2; \
+ pxor x4, x2; \
+ movdqa x0, x4; \
+ get_key(i, 1, RT0); \
+ pxor RT0, x1; \
+ get_key(i, 3, RT0); \
+ pxor RT0, x3; \
+ pslld $5, x0; \
+ psrld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ pslld $22, x2; \
+ psrld $(32 - 22), x4; \
+ por x4, x2; \
+ get_key(i, 0, RT0); \
+ pxor RT0, x0; \
+ get_key(i, 2, RT0); \
+ pxor RT0, x2;
+
+#define KL(x0, x1, x2, x3, x4, i) \
+ K(x0, x1, x2, x3, x4, i); \
+ movdqa x0, x4; \
+ psrld $5, x0; \
+ pslld $(32 - 5), x4; \
+ por x4, x0; \
+ movdqa x2, x4; \
+ psrld $22, x2; \
+ pslld $(32 - 22), x4; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pxor x3, x0; \
+ movdqa x1, x4; \
+ pslld $7, x4; \
+ pxor x1, x0; \
+ pxor x4, x2; \
+ movdqa x1, x4; \
+ psrld $1, x1; \
+ pslld $(32 - 1), x4; \
+ por x4, x1; \
+ movdqa x3, x4; \
+ psrld $7, x3; \
+ pslld $(32 - 7), x4; \
+ por x4, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pslld $3, x4; \
+ pxor x4, x3; \
+ movdqa x0, x4; \
+ psrld $13, x0; \
+ pslld $(32 - 13), x4; \
+ por x4, x0; \
+ pxor x2, x1; \
+ pxor x2, x3; \
+ movdqa x2, x4; \
+ psrld $3, x2; \
+ pslld $(32 - 3), x4; \
+ por x4, x2;
+
+#define S0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2; \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1; \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4; \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1; \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0; \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2; \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2; \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1; \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3; \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0; \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x2, t3; \
+ movdqa x0, t1; \
+ unpcklps x3, t3; \
+ movdqa x0, t2; \
+ unpcklps x1, t1; \
+ unpckhps x1, t2; \
+ movdqa t3, x1; \
+ unpckhps x3, x2; \
+ movdqa t1, x0; \
+ movhlps t1, x1; \
+ movdqa t2, t1; \
+ movlhps t3, x0; \
+ movlhps x2, t1; \
+ movhlps t2, x2; \
+ movdqa x2, x3; \
+ movdqa t1, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_4way
+.type __serpent_enc_blk_4way,@function;
+
+__serpent_enc_blk_4way:
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ * arg_xor(%esp): bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 0);
+ S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
+ S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
+ S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
+ S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
+ S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
+ S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
+ S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
+ S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
+ S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
+ S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
+ S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
+ S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
+ S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
+ S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
+ S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
+ S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
+ S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
+ S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
+ S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
+ S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
+ S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
+ S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
+ S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
+ S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
+ S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
+ S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
+ S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
+ S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
+ S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
+ S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
+ S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
+ S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
+
+ movl arg_dst(%esp), %eax;
+
+ cmpb $0, arg_xor(%esp);
+ jnz __enc_xor4;
+
+ write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+__enc_xor4:
+ xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ ret;
+
+.align 8
+.global serpent_dec_blk_4way
+.type serpent_dec_blk_4way,@function;
+
+serpent_dec_blk_4way:
+ /* input:
+ * arg_ctx(%esp): ctx, CTX
+ * arg_dst(%esp): dst
+ * arg_src(%esp): src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ movl arg_ctx(%esp), CTX;
+
+ movl arg_src(%esp), %eax;
+ read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
+
+ K(RA, RB, RC, RD, RE, 32);
+ SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
+ SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
+ SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
+ SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
+ SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
+ SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
+ SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
+ SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
+ SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
+ SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
+ SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
+ SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
+ SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
+ SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
+ SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
+ SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
+ SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
+ SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
+ SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
+ SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
+ SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
+ SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
+ SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
+ SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
+ SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
+ SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
+ SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
+ SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
+ SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
+ SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
+ SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
+ SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
+
+ movl arg_dst(%esp), %eax;
+ write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
+
+ ret;
diff --git a/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
new file mode 100644
index 000000000000..7f24a1540821
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S
@@ -0,0 +1,761 @@
+/*
+ * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Based on crypto/serpent.c by
+ * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
+ * 2003 Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "serpent-sse2-x86_64-asm_64.S"
+.text
+
+#define CTX %rdi
+
+/**********************************************************************
+ 8-way SSE2 serpent
+ **********************************************************************/
+#define RA1 %xmm0
+#define RB1 %xmm1
+#define RC1 %xmm2
+#define RD1 %xmm3
+#define RE1 %xmm4
+
+#define RA2 %xmm5
+#define RB2 %xmm6
+#define RC2 %xmm7
+#define RD2 %xmm8
+#define RE2 %xmm9
+
+#define RNOT %xmm10
+
+#define RK0 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
+#define RK3 %xmm14
+
+#define S0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ por x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x4; \
+ pxor RNOT, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pxor x0, x2;
+#define S0_2(x0, x1, x2, x3, x4) \
+ pxor x3, x0; \
+ por x0, x4; \
+ pxor x2, x0; \
+ pand x1, x2; \
+ pxor x2, x3; \
+ pxor RNOT, x1; \
+ pxor x4, x2; \
+ pxor x2, x1;
+
+#define S1_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x1; \
+ pxor x3, x0; \
+ pxor RNOT, x3; \
+ pand x1, x4; \
+ por x1, x0; \
+ pxor x2, x3; \
+ pxor x3, x0; \
+ pxor x3, x1;
+#define S1_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ por x4, x1; \
+ pxor x2, x4; \
+ pand x0, x2; \
+ pxor x1, x2; \
+ por x0, x1; \
+ pxor RNOT, x0; \
+ pxor x2, x0; \
+ pxor x1, x4;
+
+#define S2_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x3; \
+ pxor x0, x1; \
+ movdqa x0, x4; \
+ pand x2, x0; \
+ pxor x3, x0; \
+ por x4, x3; \
+ pxor x1, x2; \
+ pxor x1, x3; \
+ pand x0, x1;
+#define S2_2(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ pand x3, x2; \
+ por x1, x3; \
+ pxor RNOT, x0; \
+ pxor x0, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ por x2, x1;
+
+#define S3_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x3, x1; \
+ por x0, x3; \
+ pand x0, x4; \
+ pxor x2, x0; \
+ pxor x1, x2; \
+ pand x3, x1; \
+ pxor x3, x2; \
+ por x4, x0; \
+ pxor x3, x4;
+#define S3_2(x0, x1, x2, x3, x4) \
+ pxor x0, x1; \
+ pand x3, x0; \
+ pand x4, x3; \
+ pxor x2, x3; \
+ por x1, x4; \
+ pand x1, x2; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ pxor x2, x3;
+
+#define S4_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x4, x0; \
+ pxor x2, x3; \
+ por x4, x2; \
+ pxor x1, x0; \
+ pxor x3, x4; \
+ por x0, x2; \
+ pxor x1, x2;
+#define S4_2(x0, x1, x2, x3, x4) \
+ pand x0, x1; \
+ pxor x4, x1; \
+ pand x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x4; \
+ por x1, x3; \
+ pxor RNOT, x1; \
+ pxor x0, x3;
+
+#define S5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x0, x1; \
+ pxor x1, x2; \
+ pxor RNOT, x3; \
+ pxor x0, x4; \
+ pxor x2, x0; \
+ pand x4, x1; \
+ por x3, x4; \
+ pxor x0, x4;
+#define S5_2(x0, x1, x2, x3, x4) \
+ pand x3, x0; \
+ pxor x3, x1; \
+ pxor x2, x3; \
+ pxor x1, x0; \
+ pand x4, x2; \
+ pxor x2, x1; \
+ pand x0, x2; \
+ pxor x2, x3;
+
+#define S6_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ pxor x0, x3; \
+ pxor x2, x1; \
+ pxor x0, x2; \
+ pand x3, x0; \
+ por x3, x1; \
+ pxor RNOT, x4; \
+ pxor x1, x0; \
+ pxor x2, x1;
+#define S6_2(x0, x1, x2, x3, x4) \
+ pxor x4, x3; \
+ pxor x0, x4; \
+ pand x0, x2; \
+ pxor x1, x4; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x3; \
+ pxor x2, x1;
+
+#define S7_1(x0, x1, x2, x3, x4) \
+ pxor RNOT, x1; \
+ movdqa x1, x4; \
+ pxor RNOT, x0; \
+ pand x2, x1; \
+ pxor x3, x1; \
+ por x4, x3; \
+ pxor x2, x4; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ por x1, x0;
+#define S7_2(x0, x1, x2, x3, x4) \
+ pand x0, x2; \
+ pxor x4, x0; \
+ pxor x3, x4; \
+ pand x0, x3; \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pxor x1, x3; \
+ por x0, x4; \
+ pxor x1, x4;
+
+#define SI0_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pxor x0, x1; \
+ por x1, x3; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ pxor x3, x2; \
+ pxor x0, x3; \
+ pand x1, x0; \
+ pxor x2, x0;
+#define SI0_2(x0, x1, x2, x3, x4) \
+ pand x3, x2; \
+ pxor x4, x3; \
+ pxor x3, x2; \
+ pxor x3, x1; \
+ pand x0, x3; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pxor x3, x4;
+
+#define SI1_1(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ movdqa x0, x4; \
+ pxor x2, x0; \
+ pxor RNOT, x2; \
+ por x1, x4; \
+ pxor x3, x4; \
+ pand x1, x3; \
+ pxor x2, x1; \
+ pand x4, x2;
+#define SI1_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ por x3, x1; \
+ pxor x0, x3; \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x4, x2; \
+ pxor x0, x1; \
+ pxor x1, x4;
+
+#define SI2_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x3, x4; \
+ pxor RNOT, x3; \
+ por x2, x3; \
+ pxor x4, x2; \
+ pxor x0, x4; \
+ pxor x1, x3; \
+ por x2, x1; \
+ pxor x0, x2;
+#define SI2_2(x0, x1, x2, x3, x4) \
+ pxor x4, x1; \
+ por x3, x4; \
+ pxor x3, x2; \
+ pxor x2, x4; \
+ pand x1, x2; \
+ pxor x3, x2; \
+ pxor x4, x3; \
+ pxor x0, x4;
+
+#define SI3_1(x0, x1, x2, x3, x4) \
+ pxor x1, x2; \
+ movdqa x1, x4; \
+ pand x2, x1; \
+ pxor x0, x1; \
+ por x4, x0; \
+ pxor x3, x4; \
+ pxor x3, x0; \
+ por x1, x3; \
+ pxor x2, x1;
+#define SI3_2(x0, x1, x2, x3, x4) \
+ pxor x3, x1; \
+ pxor x2, x0; \
+ pxor x3, x2; \
+ pand x1, x3; \
+ pxor x0, x1; \
+ pand x2, x0; \
+ pxor x3, x4; \
+ pxor x0, x3; \
+ pxor x1, x0;
+
+#define SI4_1(x0, x1, x2, x3, x4) \
+ pxor x3, x2; \
+ movdqa x0, x4; \
+ pand x1, x0; \
+ pxor x2, x0; \
+ por x3, x2; \
+ pxor RNOT, x4; \
+ pxor x0, x1; \
+ pxor x2, x0; \
+ pand x4, x2;
+#define SI4_2(x0, x1, x2, x3, x4) \
+ pxor x0, x2; \
+ por x4, x0; \
+ pxor x3, x0; \
+ pand x2, x3; \
+ pxor x3, x4; \
+ pxor x1, x3; \
+ pand x0, x1; \
+ pxor x1, x4; \
+ pxor x3, x0;
+
+#define SI5_1(x0, x1, x2, x3, x4) \
+ movdqa x1, x4; \
+ por x2, x1; \
+ pxor x4, x2; \
+ pxor x3, x1; \
+ pand x4, x3; \
+ pxor x3, x2; \
+ por x0, x3; \
+ pxor RNOT, x0; \
+ pxor x2, x3; \
+ por x0, x2;
+#define SI5_2(x0, x1, x2, x3, x4) \
+ pxor x1, x4; \
+ pxor x4, x2; \
+ pand x0, x4; \
+ pxor x1, x0; \
+ pxor x3, x1; \
+ pand x2, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x4, x2; \
+ pxor x3, x4;
+
+#define SI6_1(x0, x1, x2, x3, x4) \
+ pxor x2, x0; \
+ movdqa x0, x4; \
+ pand x3, x0; \
+ pxor x3, x2; \
+ pxor x2, x0; \
+ pxor x1, x3; \
+ por x4, x2; \
+ pxor x3, x2; \
+ pand x0, x3;
+#define SI6_2(x0, x1, x2, x3, x4) \
+ pxor RNOT, x0; \
+ pxor x1, x3; \
+ pand x2, x1; \
+ pxor x0, x4; \
+ pxor x4, x3; \
+ pxor x2, x4; \
+ pxor x1, x0; \
+ pxor x0, x2;
+
+#define SI7_1(x0, x1, x2, x3, x4) \
+ movdqa x3, x4; \
+ pand x0, x3; \
+ pxor x2, x0; \
+ por x4, x2; \
+ pxor x1, x4; \
+ pxor RNOT, x0; \
+ por x3, x1; \
+ pxor x0, x4; \
+ pand x2, x0; \
+ pxor x1, x0;
+#define SI7_2(x0, x1, x2, x3, x4) \
+ pand x2, x1; \
+ pxor x2, x3; \
+ pxor x3, x4; \
+ pand x3, x2; \
+ por x0, x3; \
+ pxor x4, x1; \
+ pxor x4, x3; \
+ pand x0, x4; \
+ pxor x2, x4;
+
+#define get_key(i, j, t) \
+ movd (4*(i)+(j))*4(CTX), t; \
+ pshufd $0, t, t;
+
+#define K2(x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ get_key(i, 1, RK1); \
+ get_key(i, 2, RK2); \
+ get_key(i, 3, RK3); \
+ pxor RK0, x0 ## 1; \
+ pxor RK1, x1 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK1, x1 ## 2; \
+ pxor RK2, x2 ## 2; \
+ pxor RK3, x3 ## 2;
+
+#define LK2(x0, x1, x2, x3, x4, i) \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $13, x0 ## 1; \
+ psrld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $3, x2 ## 1; \
+ psrld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $13, x0 ## 2; \
+ psrld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $3, x2 ## 2; \
+ psrld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $1, x1 ## 1; \
+ psrld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x3 ## 1, x4 ## 1; \
+ get_key(i, 1, RK1); \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $1, x1 ## 2; \
+ psrld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x3 ## 2, x4 ## 2; \
+ get_key(i, 3, RK3); \
+ pslld $7, x3 ## 1; \
+ psrld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x0 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ get_key(i, 0, RK0); \
+ pslld $7, x3 ## 2; \
+ psrld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x0 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ get_key(i, 2, RK2); \
+ pxor RK1, x1 ## 1; \
+ pxor RK3, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $5, x0 ## 1; \
+ psrld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ pslld $22, x2 ## 1; \
+ psrld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ pxor RK1, x1 ## 2; \
+ pxor RK3, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $5, x0 ## 2; \
+ psrld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ pslld $22, x2 ## 2; \
+ psrld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2;
+
+#define KL2(x0, x1, x2, x3, x4, i) \
+ pxor RK0, x0 ## 1; \
+ pxor RK2, x2 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ psrld $5, x0 ## 1; \
+ pslld $(32 - 5), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor RK3, x3 ## 1; \
+ pxor RK1, x1 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $22, x2 ## 1; \
+ pslld $(32 - 22), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ pxor x3 ## 1, x2 ## 1; \
+ pxor RK0, x0 ## 2; \
+ pxor RK2, x2 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $5, x0 ## 2; \
+ pslld $(32 - 5), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor RK3, x3 ## 2; \
+ pxor RK1, x1 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $22, x2 ## 2; \
+ pslld $(32 - 22), x4 ## 2; \
+ por x4 ## 2, x2 ## 2; \
+ pxor x3 ## 2, x2 ## 2; \
+ pxor x3 ## 1, x0 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ pslld $7, x4 ## 1; \
+ pxor x1 ## 1, x0 ## 1; \
+ pxor x4 ## 1, x2 ## 1; \
+ movdqa x1 ## 1, x4 ## 1; \
+ psrld $1, x1 ## 1; \
+ pslld $(32 - 1), x4 ## 1; \
+ por x4 ## 1, x1 ## 1; \
+ pxor x3 ## 2, x0 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ pslld $7, x4 ## 2; \
+ pxor x1 ## 2, x0 ## 2; \
+ pxor x4 ## 2, x2 ## 2; \
+ movdqa x1 ## 2, x4 ## 2; \
+ psrld $1, x1 ## 2; \
+ pslld $(32 - 1), x4 ## 2; \
+ por x4 ## 2, x1 ## 2; \
+ movdqa x3 ## 1, x4 ## 1; \
+ psrld $7, x3 ## 1; \
+ pslld $(32 - 7), x4 ## 1; \
+ por x4 ## 1, x3 ## 1; \
+ pxor x0 ## 1, x1 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ pslld $3, x4 ## 1; \
+ pxor x4 ## 1, x3 ## 1; \
+ movdqa x0 ## 1, x4 ## 1; \
+ movdqa x3 ## 2, x4 ## 2; \
+ psrld $7, x3 ## 2; \
+ pslld $(32 - 7), x4 ## 2; \
+ por x4 ## 2, x3 ## 2; \
+ pxor x0 ## 2, x1 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ pslld $3, x4 ## 2; \
+ pxor x4 ## 2, x3 ## 2; \
+ movdqa x0 ## 2, x4 ## 2; \
+ psrld $13, x0 ## 1; \
+ pslld $(32 - 13), x4 ## 1; \
+ por x4 ## 1, x0 ## 1; \
+ pxor x2 ## 1, x1 ## 1; \
+ pxor x2 ## 1, x3 ## 1; \
+ movdqa x2 ## 1, x4 ## 1; \
+ psrld $3, x2 ## 1; \
+ pslld $(32 - 3), x4 ## 1; \
+ por x4 ## 1, x2 ## 1; \
+ psrld $13, x0 ## 2; \
+ pslld $(32 - 13), x4 ## 2; \
+ por x4 ## 2, x0 ## 2; \
+ pxor x2 ## 2, x1 ## 2; \
+ pxor x2 ## 2, x3 ## 2; \
+ movdqa x2 ## 2, x4 ## 2; \
+ psrld $3, x2 ## 2; \
+ pslld $(32 - 3), x4 ## 2; \
+ por x4 ## 2, x2 ## 2;
+
+#define S(SBOX, x0, x1, x2, x3, x4) \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
+
+#define SP(SBOX, x0, x1, x2, x3, x4, i) \
+ get_key(i, 0, RK0); \
+ SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 2, RK2); \
+ SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+ get_key(i, 3, RK3); \
+ SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
+ get_key(i, 1, RK1); \
+ SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+ movdqa x2, t3; \
+ movdqa x0, t1; \
+ unpcklps x3, t3; \
+ movdqa x0, t2; \
+ unpcklps x1, t1; \
+ unpckhps x1, t2; \
+ movdqa t3, x1; \
+ unpckhps x3, x2; \
+ movdqa t1, x0; \
+ movhlps t1, x1; \
+ movdqa t2, t1; \
+ movlhps t3, x0; \
+ movlhps x2, t1; \
+ movhlps t2, x2; \
+ movdqa x2, x3; \
+ movdqa t1, x2;
+
+#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+ movdqu (0*4*4)(in), x0; \
+ movdqu (1*4*4)(in), x1; \
+ movdqu (2*4*4)(in), x2; \
+ movdqu (3*4*4)(in), x3; \
+ \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
+
+#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu x0, (0*4*4)(out); \
+ movdqu x1, (1*4*4)(out); \
+ movdqu x2, (2*4*4)(out); \
+ movdqu x3, (3*4*4)(out);
+
+#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
+ \
+ movdqu (0*4*4)(out), t0; \
+ pxor t0, x0; \
+ movdqu x0, (0*4*4)(out); \
+ movdqu (1*4*4)(out), t0; \
+ pxor t0, x1; \
+ movdqu x1, (1*4*4)(out); \
+ movdqu (2*4*4)(out), t0; \
+ pxor t0, x2; \
+ movdqu x2, (2*4*4)(out); \
+ movdqu (3*4*4)(out), t0; \
+ pxor t0, x3; \
+ movdqu x3, (3*4*4)(out);
+
+.align 8
+.global __serpent_enc_blk_8way
+.type __serpent_enc_blk_8way,@function;
+
+__serpent_enc_blk_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ * %rcx: bool, if true: xor output
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 0);
+ S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
+ S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
+ S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
+ S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
+ S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
+ S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
+ S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
+ S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
+ S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
+ S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
+ S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
+ S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
+ S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
+ S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
+ S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
+ S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
+ S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
+ S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
+ S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
+ S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
+ S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
+ S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
+ S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
+ S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
+ S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
+ S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
+ S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
+ S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
+ S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
+ S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
+ S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
+ S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
+
+ leaq (4*4*4)(%rsi), %rax;
+
+ testb %cl, %cl;
+ jnz __enc_xor8;
+
+ write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+__enc_xor8:
+ xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ ret;
+
+.align 8
+.global serpent_dec_blk_8way
+.type serpent_dec_blk_8way,@function;
+
+serpent_dec_blk_8way:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst
+ * %rdx: src
+ */
+
+ pcmpeqd RNOT, RNOT;
+
+ leaq (4*4*4)(%rdx), %rax;
+ read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
+ read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
+
+ K2(RA, RB, RC, RD, RE, 32);
+ SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
+ SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
+ SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
+ SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
+ SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
+ SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
+ SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
+ SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
+ SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
+ SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
+ SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
+ SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
+ SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
+ SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
+ SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
+ SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
+ SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
+ SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
+ SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
+ SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
+ SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
+ SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
+ SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
+ SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
+ SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
+ SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
+ SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
+ SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
+ SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
+ SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
+ SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
+ S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
+
+ leaq (4*4*4)(%rsi), %rax;
+ write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
+ write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
+
+ ret;
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
new file mode 100644
index 000000000000..7955a9b76b91
--- /dev/null
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -0,0 +1,1070 @@
+/*
+ * Glue Code for SSE2 assembler versions of Serpent Cipher
+ *
+ * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * Glue code based on aesni-intel_glue.c by:
+ * Copyright (C) 2008, Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by:
+ * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au>
+ * CTR part based on code (crypto/ctr.c) by:
+ * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <crypto/algapi.h>
+#include <crypto/serpent.h>
+#include <crypto/cryptd.h>
+#include <crypto/b128ops.h>
+#include <crypto/ctr.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
+#include <asm/i387.h>
+#include <asm/serpent.h>
+#include <crypto/scatterwalk.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+
+struct async_serpent_ctx {
+ struct cryptd_ablkcipher *cryptd_tfm;
+};
+
+static inline bool serpent_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+{
+ if (fpu_enabled)
+ return true;
+
+ /* SSE2 is only used when chunk to be processed is large enough, so
+ * do not enable FPU until it is necessary.
+ */
+ if (nbytes < SERPENT_BLOCK_SIZE * SERPENT_PARALLEL_BLOCKS)
+ return false;
+
+ kernel_fpu_begin();
+ return true;
+}
+
+static inline void serpent_fpu_end(bool fpu_enabled)
+{
+ if (fpu_enabled)
+ kernel_fpu_end();
+}
+
+static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+ bool enc)
+{
+ bool fpu_enabled = false;
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes;
+ int err;
+
+ err = blkcipher_walk_virt(desc, walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk->nbytes)) {
+ u8 *wsrc = walk->src.virt.addr;
+ u8 *wdst = walk->dst.virt.addr;
+
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ if (enc)
+ serpent_enc_blk_xway(ctx, wdst, wsrc);
+ else
+ serpent_dec_blk_xway(ctx, wdst, wsrc);
+
+ wsrc += bsize * SERPENT_PARALLEL_BLOCKS;
+ wdst += bsize * SERPENT_PARALLEL_BLOCKS;
+ nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (enc)
+ __serpent_encrypt(ctx, wdst, wsrc);
+ else
+ __serpent_decrypt(ctx, wdst, wsrc);
+
+ wsrc += bsize;
+ wdst += bsize;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ err = blkcipher_walk_done(desc, walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+ return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ return ecb_crypt(desc, &walk, false);
+}
+
+static struct crypto_alg blk_ecb_alg = {
+ .cra_name = "__ecb-serpent-sse2",
+ .cra_driver_name = "__driver-ecb-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ },
+ },
+};
+
+static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 *iv = (u128 *)walk->iv;
+
+ do {
+ u128_xor(dst, src, iv);
+ __serpent_encrypt(ctx, (u8 *)dst, (u8 *)dst);
+ iv = dst;
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+ u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv);
+ return nbytes;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+
+ while ((nbytes = walk.nbytes)) {
+ nbytes = __cbc_encrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ return err;
+}
+
+static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ u128 last_iv;
+ int i;
+
+ /* Start of the last block. */
+ src += nbytes / bsize - 1;
+ dst += nbytes / bsize - 1;
+
+ last_iv = *src;
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ nbytes -= bsize * (SERPENT_PARALLEL_BLOCKS - 1);
+ src -= SERPENT_PARALLEL_BLOCKS - 1;
+ dst -= SERPENT_PARALLEL_BLOCKS - 1;
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+ ivs[i] = src[i];
+
+ serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
+
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS - 1; i++)
+ u128_xor(dst + (i + 1), dst + (i + 1), ivs + i);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ goto done;
+
+ u128_xor(dst, dst, src - 1);
+ src -= 1;
+ dst -= 1;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ for (;;) {
+ __serpent_decrypt(ctx, (u8 *)dst, (u8 *)src);
+
+ nbytes -= bsize;
+ if (nbytes < bsize)
+ break;
+
+ u128_xor(dst, dst, src - 1);
+ src -= 1;
+ dst -= 1;
+ }
+
+done:
+ u128_xor(dst, dst, (u128 *)walk->iv);
+ *(u128 *)walk->iv = last_iv;
+
+ return nbytes;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt(desc, &walk);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk.nbytes)) {
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+ nbytes = __cbc_decrypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+ return err;
+}
+
+static struct crypto_alg blk_cbc_alg = {
+ .cra_name = "__cbc-serpent-sse2",
+ .cra_driver_name = "__driver-cbc-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
+ },
+ },
+};
+
+static inline void u128_to_be128(be128 *dst, const u128 *src)
+{
+ dst->a = cpu_to_be64(src->a);
+ dst->b = cpu_to_be64(src->b);
+}
+
+static inline void be128_to_u128(u128 *dst, const be128 *src)
+{
+ dst->a = be64_to_cpu(src->a);
+ dst->b = be64_to_cpu(src->b);
+}
+
+static inline void u128_inc(u128 *i)
+{
+ i->b++;
+ if (!i->b)
+ i->a++;
+}
+
+static void ctr_crypt_final(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ u8 *ctrblk = walk->iv;
+ u8 keystream[SERPENT_BLOCK_SIZE];
+ u8 *src = walk->src.virt.addr;
+ u8 *dst = walk->dst.virt.addr;
+ unsigned int nbytes = walk->nbytes;
+
+ __serpent_encrypt(ctx, keystream, ctrblk);
+ crypto_xor(keystream, src, nbytes);
+ memcpy(dst, keystream, nbytes);
+
+ crypto_inc(ctrblk, SERPENT_BLOCK_SIZE);
+}
+
+static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
+ struct blkcipher_walk *walk)
+{
+ struct serpent_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = (u128 *)walk->src.virt.addr;
+ u128 *dst = (u128 *)walk->dst.virt.addr;
+ u128 ctrblk;
+ be128 ctrblocks[SERPENT_PARALLEL_BLOCKS];
+ int i;
+
+ be128_to_u128(&ctrblk, (be128 *)walk->iv);
+
+ /* Process multi-block batch */
+ if (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS) {
+ do {
+ /* create ctrblks for parallel encrypt */
+ for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
+ if (dst != src)
+ dst[i] = src[i];
+
+ u128_to_be128(&ctrblocks[i], &ctrblk);
+ u128_inc(&ctrblk);
+ }
+
+ serpent_enc_blk_xway_xor(ctx, (u8 *)dst,
+ (u8 *)ctrblocks);
+
+ src += SERPENT_PARALLEL_BLOCKS;
+ dst += SERPENT_PARALLEL_BLOCKS;
+ nbytes -= bsize * SERPENT_PARALLEL_BLOCKS;
+ } while (nbytes >= bsize * SERPENT_PARALLEL_BLOCKS);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+
+ /* Handle leftovers */
+ do {
+ if (dst != src)
+ *dst = *src;
+
+ u128_to_be128(&ctrblocks[0], &ctrblk);
+ u128_inc(&ctrblk);
+
+ __serpent_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks);
+ u128_xor(dst, dst, (u128 *)ctrblocks);
+
+ src += 1;
+ dst += 1;
+ nbytes -= bsize;
+ } while (nbytes >= bsize);
+
+done:
+ u128_to_be128((be128 *)walk->iv, &ctrblk);
+ return nbytes;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ bool fpu_enabled = false;
+ struct blkcipher_walk walk;
+ int err;
+
+ blkcipher_walk_init(&walk, dst, src, nbytes);
+ err = blkcipher_walk_virt_block(desc, &walk, SERPENT_BLOCK_SIZE);
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ while ((nbytes = walk.nbytes) >= SERPENT_BLOCK_SIZE) {
+ fpu_enabled = serpent_fpu_begin(fpu_enabled, nbytes);
+ nbytes = __ctr_crypt(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, nbytes);
+ }
+
+ serpent_fpu_end(fpu_enabled);
+
+ if (walk.nbytes) {
+ ctr_crypt_final(desc, &walk);
+ err = blkcipher_walk_done(desc, &walk, 0);
+ }
+
+ return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+ .cra_name = "__ctr-serpent-sse2",
+ .cra_driver_name = "__driver-ctr-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = serpent_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ },
+ },
+};
+
+struct crypt_priv {
+ struct serpent_ctx *ctx;
+ bool fpu_enabled;
+};
+
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ struct crypt_priv *ctx = priv;
+ int i;
+
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+ if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ __serpent_encrypt(ctx->ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = SERPENT_BLOCK_SIZE;
+ struct crypt_priv *ctx = priv;
+ int i;
+
+ ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
+
+ if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
+ serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ __serpent_decrypt(ctx->ctx, srcdst, srcdst);
+}
+
+struct serpent_lrw_ctx {
+ struct lrw_table_ctx lrw_table;
+ struct serpent_ctx serpent_ctx;
+};
+
+static int lrw_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+ int err;
+
+ err = __serpent_setkey(&ctx->serpent_ctx, key, keylen -
+ SERPENT_BLOCK_SIZE);
+ if (err)
+ return err;
+
+ return lrw_init_table(&ctx->lrw_table, key + keylen -
+ SERPENT_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->serpent_ctx,
+ .fpu_enabled = false,
+ };
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = encrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = lrw_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->serpent_ctx,
+ .fpu_enabled = false,
+ };
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = decrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = lrw_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct serpent_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+ .cra_name = "__lrw-serpent-sse2",
+ .cra_driver_name = "__driver-lrw-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_lrw_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+ .cra_exit = lrw_exit_tfm,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE +
+ SERPENT_BLOCK_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE +
+ SERPENT_BLOCK_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = lrw_serpent_setkey,
+ .encrypt = lrw_encrypt,
+ .decrypt = lrw_decrypt,
+ },
+ },
+};
+
+struct serpent_xts_ctx {
+ struct serpent_ctx tweak_ctx;
+ struct serpent_ctx crypt_ctx;
+};
+
+static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 *flags = &tfm->crt_flags;
+ int err;
+
+ /* key consists of keys of equal size concatenated, therefore
+ * the length must be even
+ */
+ if (keylen % 2) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ /* first half of xts-key is for crypt */
+ err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __serpent_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->crypt_ctx,
+ .fpu_enabled = false,
+ };
+ struct xts_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .tweak_ctx = &ctx->tweak_ctx,
+ .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = encrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = xts_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct serpent_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[SERPENT_PARALLEL_BLOCKS];
+ struct crypt_priv crypt_ctx = {
+ .ctx = &ctx->crypt_ctx,
+ .fpu_enabled = false,
+ };
+ struct xts_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .tweak_ctx = &ctx->tweak_ctx,
+ .tweak_fn = XTS_TWEAK_CAST(__serpent_encrypt),
+ .crypt_ctx = &crypt_ctx,
+ .crypt_fn = decrypt_callback,
+ };
+ int ret;
+
+ desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ ret = xts_crypt(desc, dst, src, nbytes, &req);
+ serpent_fpu_end(crypt_ctx.fpu_enabled);
+
+ return ret;
+}
+
+static struct crypto_alg blk_xts_alg = {
+ .cra_name = "__xts-serpent-sse2",
+ .cra_driver_name = "__driver-xts-serpent-sse2",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct serpent_xts_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
+ .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = xts_serpent_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+ },
+};
+
+static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
+ unsigned int key_len)
+{
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base;
+ int err;
+
+ crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm)
+ & CRYPTO_TFM_REQ_MASK);
+ err = crypto_ablkcipher_setkey(child, key, key_len);
+ crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child)
+ & CRYPTO_TFM_RES_MASK);
+ return err;
+}
+
+static int __ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->encrypt(
+ &desc, req->dst, req->src, req->nbytes);
+}
+
+static int ablk_encrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_encrypt(cryptd_req);
+ } else {
+ return __ablk_encrypt(req);
+ }
+}
+
+static int ablk_decrypt(struct ablkcipher_request *req)
+{
+ struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req);
+ struct async_serpent_ctx *ctx = crypto_ablkcipher_ctx(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct ablkcipher_request *cryptd_req =
+ ablkcipher_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+
+ return crypto_ablkcipher_decrypt(cryptd_req);
+ } else {
+ struct blkcipher_desc desc;
+
+ desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm);
+ desc.info = req->info;
+ desc.flags = 0;
+
+ return crypto_blkcipher_crt(desc.tfm)->decrypt(
+ &desc, req->dst, req->src, req->nbytes);
+ }
+}
+
+static void ablk_exit(struct crypto_tfm *tfm)
+{
+ struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ cryptd_free_ablkcipher(ctx->cryptd_tfm);
+}
+
+static void ablk_init_common(struct crypto_tfm *tfm,
+ struct cryptd_ablkcipher *cryptd_tfm)
+{
+ struct async_serpent_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx->cryptd_tfm = cryptd_tfm;
+ tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) +
+ crypto_ablkcipher_reqsize(&cryptd_tfm->base);
+}
+
+static int ablk_ecb_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_ecb_alg = {
+ .cra_name = "ecb(serpent)",
+ .cra_driver_name = "ecb-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list),
+ .cra_init = ablk_ecb_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int ablk_cbc_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_cbc_alg = {
+ .cra_name = "cbc(serpent)",
+ .cra_driver_name = "cbc-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list),
+ .cra_init = ablk_cbc_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = __ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int ablk_ctr_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_ctr_alg = {
+ .cra_name = "ctr(serpent)",
+ .cra_driver_name = "ctr-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list),
+ .cra_init = ablk_ctr_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_encrypt,
+ .geniv = "chainiv",
+ },
+ },
+};
+
+static int ablk_lrw_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-lrw-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_lrw_alg = {
+ .cra_name = "lrw(serpent)",
+ .cra_driver_name = "lrw-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list),
+ .cra_init = ablk_lrw_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE +
+ SERPENT_BLOCK_SIZE,
+ .max_keysize = SERPENT_MAX_KEY_SIZE +
+ SERPENT_BLOCK_SIZE,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int ablk_xts_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_ablkcipher *cryptd_tfm;
+
+ cryptd_tfm = cryptd_alloc_ablkcipher("__driver-xts-serpent-sse2", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ablk_init_common(tfm, cryptd_tfm);
+ return 0;
+}
+
+static struct crypto_alg ablk_xts_alg = {
+ .cra_name = "xts(serpent)",
+ .cra_driver_name = "xts-serpent-sse2",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = SERPENT_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct async_serpent_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_ablkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list),
+ .cra_init = ablk_xts_init,
+ .cra_exit = ablk_exit,
+ .cra_u = {
+ .ablkcipher = {
+ .min_keysize = SERPENT_MIN_KEY_SIZE * 2,
+ .max_keysize = SERPENT_MAX_KEY_SIZE * 2,
+ .ivsize = SERPENT_BLOCK_SIZE,
+ .setkey = ablk_set_key,
+ .encrypt = ablk_encrypt,
+ .decrypt = ablk_decrypt,
+ },
+ },
+};
+
+static int __init serpent_sse2_init(void)
+{
+ int err;
+
+ if (!cpu_has_xmm2) {
+ printk(KERN_INFO "SSE2 instructions are not detected.\n");
+ return -ENODEV;
+ }
+
+ err = crypto_register_alg(&blk_ecb_alg);
+ if (err)
+ goto blk_ecb_err;
+ err = crypto_register_alg(&blk_cbc_alg);
+ if (err)
+ goto blk_cbc_err;
+ err = crypto_register_alg(&blk_ctr_alg);
+ if (err)
+ goto blk_ctr_err;
+ err = crypto_register_alg(&ablk_ecb_alg);
+ if (err)
+ goto ablk_ecb_err;
+ err = crypto_register_alg(&ablk_cbc_alg);
+ if (err)
+ goto ablk_cbc_err;
+ err = crypto_register_alg(&ablk_ctr_alg);
+ if (err)
+ goto ablk_ctr_err;
+ err = crypto_register_alg(&blk_lrw_alg);
+ if (err)
+ goto blk_lrw_err;
+ err = crypto_register_alg(&ablk_lrw_alg);
+ if (err)
+ goto ablk_lrw_err;
+ err = crypto_register_alg(&blk_xts_alg);
+ if (err)
+ goto blk_xts_err;
+ err = crypto_register_alg(&ablk_xts_alg);
+ if (err)
+ goto ablk_xts_err;
+ return err;
+
+ crypto_unregister_alg(&ablk_xts_alg);
+ablk_xts_err:
+ crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+ crypto_unregister_alg(&ablk_lrw_alg);
+ablk_lrw_err:
+ crypto_unregister_alg(&blk_lrw_alg);
+blk_lrw_err:
+ crypto_unregister_alg(&ablk_ctr_alg);
+ablk_ctr_err:
+ crypto_unregister_alg(&ablk_cbc_alg);
+ablk_cbc_err:
+ crypto_unregister_alg(&ablk_ecb_alg);
+ablk_ecb_err:
+ crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+ crypto_unregister_alg(&blk_cbc_alg);
+blk_cbc_err:
+ crypto_unregister_alg(&blk_ecb_alg);
+blk_ecb_err:
+ return err;
+}
+
+static void __exit serpent_sse2_exit(void)
+{
+ crypto_unregister_alg(&ablk_xts_alg);
+ crypto_unregister_alg(&blk_xts_alg);
+ crypto_unregister_alg(&ablk_lrw_alg);
+ crypto_unregister_alg(&blk_lrw_alg);
+ crypto_unregister_alg(&ablk_ctr_alg);
+ crypto_unregister_alg(&ablk_cbc_alg);
+ crypto_unregister_alg(&ablk_ecb_alg);
+ crypto_unregister_alg(&blk_ctr_alg);
+ crypto_unregister_alg(&blk_cbc_alg);
+ crypto_unregister_alg(&blk_ecb_alg);
+}
+
+module_init(serpent_sse2_init);
+module_exit(serpent_sse2_exit);
+
+MODULE_DESCRIPTION("Serpent Cipher Algorithm, SSE2 optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("serpent");
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 5ede9c444c3e..7fee8c152f93 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -32,6 +32,8 @@
#include <crypto/algapi.h>
#include <crypto/twofish.h>
#include <crypto/b128ops.h>
+#include <crypto/lrw.h>
+#include <crypto/xts.h>
/* regular block cipher functions from twofish_x86_64 module */
asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
@@ -432,6 +434,209 @@ static struct crypto_alg blk_ctr_alg = {
},
};
+static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = TF_BLOCK_SIZE;
+ struct twofish_ctx *ctx = priv;
+ int i;
+
+ if (nbytes == 3 * bsize) {
+ twofish_enc_blk_3way(ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ twofish_enc_blk(ctx, srcdst, srcdst);
+}
+
+static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+{
+ const unsigned int bsize = TF_BLOCK_SIZE;
+ struct twofish_ctx *ctx = priv;
+ int i;
+
+ if (nbytes == 3 * bsize) {
+ twofish_dec_blk_3way(ctx, srcdst, srcdst);
+ return;
+ }
+
+ for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
+ twofish_dec_blk(ctx, srcdst, srcdst);
+}
+
+struct twofish_lrw_ctx {
+ struct lrw_table_ctx lrw_table;
+ struct twofish_ctx twofish_ctx;
+};
+
+static int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+ int err;
+
+ err = __twofish_setkey(&ctx->twofish_ctx, key, keylen - TF_BLOCK_SIZE,
+ &tfm->crt_flags);
+ if (err)
+ return err;
+
+ return lrw_init_table(&ctx->lrw_table, key + keylen - TF_BLOCK_SIZE);
+}
+
+static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[3];
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &ctx->twofish_ctx,
+ .crypt_fn = encrypt_callback,
+ };
+
+ return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct twofish_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[3];
+ struct lrw_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .table_ctx = &ctx->lrw_table,
+ .crypt_ctx = &ctx->twofish_ctx,
+ .crypt_fn = decrypt_callback,
+ };
+
+ return lrw_crypt(desc, dst, src, nbytes, &req);
+}
+
+static void lrw_exit_tfm(struct crypto_tfm *tfm)
+{
+ struct twofish_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ lrw_free_table(&ctx->lrw_table);
+}
+
+static struct crypto_alg blk_lrw_alg = {
+ .cra_name = "lrw(twofish)",
+ .cra_driver_name = "lrw-twofish-3way",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = TF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct twofish_lrw_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_lrw_alg.cra_list),
+ .cra_exit = lrw_exit_tfm,
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = TF_MIN_KEY_SIZE + TF_BLOCK_SIZE,
+ .max_keysize = TF_MAX_KEY_SIZE + TF_BLOCK_SIZE,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = lrw_twofish_setkey,
+ .encrypt = lrw_encrypt,
+ .decrypt = lrw_decrypt,
+ },
+ },
+};
+
+struct twofish_xts_ctx {
+ struct twofish_ctx tweak_ctx;
+ struct twofish_ctx crypt_ctx;
+};
+
+static int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
+ unsigned int keylen)
+{
+ struct twofish_xts_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 *flags = &tfm->crt_flags;
+ int err;
+
+ /* key consists of keys of equal size concatenated, therefore
+ * the length must be even
+ */
+ if (keylen % 2) {
+ *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ return -EINVAL;
+ }
+
+ /* first half of xts-key is for crypt */
+ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ if (err)
+ return err;
+
+ /* second half of xts-key is for tweak */
+ return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+ flags);
+}
+
+static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[3];
+ struct xts_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .tweak_ctx = &ctx->tweak_ctx,
+ .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+ .crypt_ctx = &ctx->crypt_ctx,
+ .crypt_fn = encrypt_callback,
+ };
+
+ return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes)
+{
+ struct twofish_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ be128 buf[3];
+ struct xts_crypt_req req = {
+ .tbuf = buf,
+ .tbuflen = sizeof(buf),
+
+ .tweak_ctx = &ctx->tweak_ctx,
+ .tweak_fn = XTS_TWEAK_CAST(twofish_enc_blk),
+ .crypt_ctx = &ctx->crypt_ctx,
+ .crypt_fn = decrypt_callback,
+ };
+
+ return xts_crypt(desc, dst, src, nbytes, &req);
+}
+
+static struct crypto_alg blk_xts_alg = {
+ .cra_name = "xts(twofish)",
+ .cra_driver_name = "xts-twofish-3way",
+ .cra_priority = 300,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_blocksize = TF_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct twofish_xts_ctx),
+ .cra_alignmask = 0,
+ .cra_type = &crypto_blkcipher_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(blk_xts_alg.cra_list),
+ .cra_u = {
+ .blkcipher = {
+ .min_keysize = TF_MIN_KEY_SIZE * 2,
+ .max_keysize = TF_MAX_KEY_SIZE * 2,
+ .ivsize = TF_BLOCK_SIZE,
+ .setkey = xts_twofish_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
+ },
+ },
+};
+
int __init init(void)
{
int err;
@@ -445,9 +650,20 @@ int __init init(void)
err = crypto_register_alg(&blk_ctr_alg);
if (err)
goto ctr_err;
+ err = crypto_register_alg(&blk_lrw_alg);
+ if (err)
+ goto blk_lrw_err;
+ err = crypto_register_alg(&blk_xts_alg);
+ if (err)
+ goto blk_xts_err;
return 0;
+ crypto_unregister_alg(&blk_xts_alg);
+blk_xts_err:
+ crypto_unregister_alg(&blk_lrw_alg);
+blk_lrw_err:
+ crypto_unregister_alg(&blk_ctr_alg);
ctr_err:
crypto_unregister_alg(&blk_cbc_alg);
cbc_err:
@@ -458,6 +674,8 @@ ecb_err:
void __exit fini(void)
{
+ crypto_unregister_alg(&blk_xts_alg);
+ crypto_unregister_alg(&blk_lrw_alg);
crypto_unregister_alg(&blk_ctr_alg);
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 72f853aea478..1106261856c8 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -132,7 +132,7 @@ ENTRY(ia32_sysenter_target)
CFI_REL_OFFSET rsp,0
pushfq_cfi
/*CFI_REL_OFFSET rflags,0*/
- movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
+ movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
CFI_REGISTER rip,r10
pushq_cfi $__USER32_CS
/*CFI_REL_OFFSET cs,0*/
@@ -148,9 +148,8 @@ ENTRY(ia32_sysenter_target)
.section __ex_table,"a"
.quad 1b,ia32_badarg
.previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
@@ -160,13 +159,12 @@ sysenter_do_call:
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
- GET_THREAD_INFO(%r10)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz sysexit_audit
sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status(%r10)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
/* clear IF, that popfq doesn't enable interrupts early */
andl $~0x200,EFLAGS-R11(%rsp)
movl RIP-R11(%rsp),%edx /* User %eip */
@@ -203,7 +201,7 @@ sysexit_from_sys_call:
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
sti
@@ -213,12 +211,11 @@ sysexit_from_sys_call:
movzbl %al,%edi /* zero-extend that into %edi */
inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
call audit_syscall_exit
- GET_THREAD_INFO(%r10)
movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
cli
TRACE_IRQS_OFF
- testl %edi,TI_flags(%r10)
+ testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz \exit
CLEAR_RREGS -ARGOFFSET
jmp int_with_check
@@ -236,7 +233,7 @@ sysexit_audit:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz sysenter_auditsys
#endif
SAVE_REST
@@ -307,9 +304,8 @@ ENTRY(ia32_cstar_target)
.section __ex_table,"a"
.quad 1b,ia32_badarg
.previous
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
@@ -319,13 +315,12 @@ cstar_do_call:
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
movq %rax,RAX-ARGOFFSET(%rsp)
- GET_THREAD_INFO(%r10)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz sysretl_audit
sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status(%r10)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
RESTORE_ARGS 0,-ARG_SKIP,0,0,0
movl RIP-ARGOFFSET(%rsp),%ecx
CFI_REGISTER rip,rcx
@@ -353,7 +348,7 @@ sysretl_audit:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
@@ -418,9 +413,8 @@ ENTRY(ia32_syscall)
/* note the registers are not zero extended to the sf.
this could be a problem. */
SAVE_ARGS 0,1,0
- GET_THREAD_INFO(%r10)
- orl $TS_COMPAT,TI_status(%r10)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
@@ -454,8 +448,8 @@ ia32_badsys:
CFI_ENDPROC
.macro PTREGSCALL label, func, arg
- .globl \label
-\label:
+ ALIGN
+GLOBAL(\label)
leaq \func(%rip),%rax
leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */
jmp ia32_ptregs_common
@@ -472,7 +466,8 @@ ia32_badsys:
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
-ENTRY(ia32_ptregs_common)
+ ALIGN
+ia32_ptregs_common:
popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 091508b533b4..952bd0100c5c 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -4,10 +4,10 @@
#ifdef CONFIG_SMP
.macro LOCK_PREFIX
-1: lock
+672: lock
.section .smp_locks,"a"
.balign 4
- .long 1b - .
+ .long 672b - .
.previous
.endm
#else
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071704a5..49ad773f4b9f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_AMD_NB_H
#define _ASM_X86_AMD_NB_H
+#include <linux/ioport.h>
#include <linux/pci.h>
struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
extern int amd_cache_northbridges(void);
extern void amd_flush_garts(void);
extern int amd_numa_init(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 9b7273cb2193..3ab9bdd87e79 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -49,6 +49,7 @@ extern unsigned int apic_verbosity;
extern int local_apic_timer_c2_ok;
extern int disable_apic;
+extern unsigned int lapic_timer_frequency;
#ifdef CONFIG_SMP
extern void __inquire_remote_apic(int apicid);
@@ -175,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void)
}
extern int x2apic_phys;
+extern int x2apic_preenabled;
extern void check_x2apic(void);
extern void enable_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
@@ -197,6 +199,9 @@ static inline void x2apic_force_phys(void)
x2apic_phys = 1;
}
#else
+static inline void disable_x2apic(void)
+{
+}
static inline void check_x2apic(void)
{
}
@@ -211,6 +216,7 @@ static inline void x2apic_force_phys(void)
{
}
+#define nox2apic 0
#define x2apic_preenabled 0
#define x2apic_supported() 0
#endif
@@ -409,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
+
static inline u32 apic_read(u32 reg)
{
return apic->read(reg);
diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h
new file mode 100644
index 000000000000..a2d312796440
--- /dev/null
+++ b/arch/x86/include/asm/apic_flat_64.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_APIC_FLAT_64_H
+#define _ASM_X86_APIC_FLAT_64_H
+
+extern void flat_init_apic_ldr(void);
+
+#endif
+
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3925d8007864..134bba00df09 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -144,6 +144,7 @@
#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
#define APIC_BASE_MSR 0x800
+#define XAPIC_ENABLE (1UL << 11)
#define X2APIC_ENABLE (1UL << 10)
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 24098aafce0d..fa13f0ec2874 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -82,7 +82,7 @@ static inline void atomic64_set(atomic64_t *v, long long i)
*
* Atomically reads the value of @v and returns it.
*/
-static inline long long atomic64_read(atomic64_t *v)
+static inline long long atomic64_read(const atomic64_t *v)
{
long long r;
asm volatile(ATOMIC64_ALTERNATIVE(read)
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 1775d6e5920e..b97596e2b68c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word)
return word;
}
+#undef ADDR
+
#ifdef __KERNEL__
/**
* ffs - find first set bit in word
@@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word)
static inline int ffs(int x)
{
int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ long tmp = -1;
+ asm("bsfl %1,%0"
+ : "=r" (r)
+ : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
asm("bsfl %1,%0\n\t"
"cmovzl %2,%0"
- : "=r" (r) : "rm" (x), "r" (-1));
+ : "=&r" (r) : "rm" (x), "r" (-1));
#else
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
@@ -422,7 +439,22 @@ static inline int ffs(int x)
static inline int fls(int x)
{
int r;
-#ifdef CONFIG_X86_CMOV
+
+#ifdef CONFIG_X86_64
+ /*
+ * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before, except that the
+ * top 32 bits will be cleared.
+ *
+ * We cannot do this on 32 bits because at the very least some
+ * 486 CPUs did not behave this way.
+ */
+ long tmp = -1;
+ asm("bsrl %1,%0"
+ : "=r" (r)
+ : "rm" (x), "0" (tmp));
+#elif defined(CONFIG_X86_CMOV)
asm("bsrl %1,%0\n\t"
"cmovzl %2,%0"
: "=&r" (r) : "rm" (x), "rm" (-1));
@@ -434,11 +466,35 @@ static inline int fls(int x)
#endif
return r + 1;
}
-#endif /* __KERNEL__ */
-
-#undef ADDR
-#ifdef __KERNEL__
+/**
+ * fls64 - find last set bit in a 64-bit word
+ * @x: the word to search
+ *
+ * This is defined in a similar way as the libc and compiler builtin
+ * ffsll, but returns the position of the most significant set bit.
+ *
+ * fls64(value) returns 0 if value is 0 or the position of the last
+ * set bit if value is nonzero. The last (most significant) bit is
+ * at position 64.
+ */
+#ifdef CONFIG_X86_64
+static __always_inline int fls64(__u64 x)
+{
+ long bitpos = -1;
+ /*
+ * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
+ * dest reg is undefined if x==0, but their CPU architect says its
+ * value is written to set it to the same as before.
+ */
+ asm("bsrq %1,%0"
+ : "+r" (bitpos)
+ : "rm" (x));
+ return bitpos + 1;
+}
+#else
+#include <asm-generic/bitops/fls64.h>
+#endif
#include <asm-generic/bitops/find.h>
@@ -450,12 +506,6 @@ static inline int fls(int x)
#include <asm-generic/bitops/const_hweight.h>
-#endif /* __KERNEL__ */
-
-#include <asm-generic/bitops/fls64.h>
-
-#ifdef __KERNEL__
-
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index e020d88ec02d..2f90c51cc49d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -64,6 +64,8 @@ struct setup_header {
__u32 payload_offset;
__u32 payload_length;
__u64 setup_data;
+ __u64 pref_address;
+ __u32 init_size;
} __attribute__((packed));
struct sys_desc_table {
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 5d3acdf5a7a6..0c9fa2745f13 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void)
__compiletime_error("Bad argument size for cmpxchg");
extern void __xadd_wrong_size(void)
__compiletime_error("Bad argument size for xadd");
+extern void __add_wrong_size(void)
+ __compiletime_error("Bad argument size for add");
/*
* Constants for operation sizes. On 32-bit, the 64-bit size it set to
@@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void)
#define __X86_CASE_Q -1 /* sizeof will never return -1 */
#endif
+/*
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns a the old value.
+ */
+#define __xchg_op(ptr, arg, op, lock) \
+ ({ \
+ __typeof__ (*(ptr)) __ret = (arg); \
+ switch (sizeof(*(ptr))) { \
+ case __X86_CASE_B: \
+ asm volatile (lock #op "b %b0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_W: \
+ asm volatile (lock #op "w %w0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_L: \
+ asm volatile (lock #op "l %0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_Q: \
+ asm volatile (lock #op "q %q0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ default: \
+ __ ## op ## _wrong_size(); \
+ } \
+ __ret; \
+ })
+
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
* Since this is generally used to protect other memory information, we
* use "asm volatile" and "memory" clobbers to prevent gcc from moving
* information around.
*/
-#define __xchg(x, ptr, size) \
-({ \
- __typeof(*(ptr)) __x = (x); \
- switch (size) { \
- case __X86_CASE_B: \
- { \
- volatile u8 *__ptr = (volatile u8 *)(ptr); \
- asm volatile("xchgb %0,%1" \
- : "=q" (__x), "+m" (*__ptr) \
- : "0" (__x) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_W: \
- { \
- volatile u16 *__ptr = (volatile u16 *)(ptr); \
- asm volatile("xchgw %0,%1" \
- : "=r" (__x), "+m" (*__ptr) \
- : "0" (__x) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_L: \
- { \
- volatile u32 *__ptr = (volatile u32 *)(ptr); \
- asm volatile("xchgl %0,%1" \
- : "=r" (__x), "+m" (*__ptr) \
- : "0" (__x) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_Q: \
- { \
- volatile u64 *__ptr = (volatile u64 *)(ptr); \
- asm volatile("xchgq %0,%1" \
- : "=r" (__x), "+m" (*__ptr) \
- : "0" (__x) \
- : "memory"); \
- break; \
- } \
- default: \
- __xchg_wrong_size(); \
- } \
- __x; \
-})
-
-#define xchg(ptr, v) \
- __xchg((v), (ptr), sizeof(*ptr))
+#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
@@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void)
__cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
#endif
-#define __xadd(ptr, inc, lock) \
+/*
+ * xadd() adds "inc" to "*ptr" and atomically returns the previous
+ * value of "*ptr".
+ *
+ * xadd() is locked when multiple CPUs are online
+ * xadd_sync() is always locked
+ * xadd_local() is never locked
+ */
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
+#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
+#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
+
+#define __add(ptr, inc, lock) \
({ \
__typeof__ (*(ptr)) __ret = (inc); \
switch (sizeof(*(ptr))) { \
case __X86_CASE_B: \
- asm volatile (lock "xaddb %b0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
+ asm volatile (lock "addb %b1, %0\n" \
+ : "+m" (*(ptr)) : "ri" (inc) \
+ : "memory", "cc"); \
break; \
case __X86_CASE_W: \
- asm volatile (lock "xaddw %w0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
+ asm volatile (lock "addw %w1, %0\n" \
+ : "+m" (*(ptr)) : "ri" (inc) \
+ : "memory", "cc"); \
break; \
case __X86_CASE_L: \
- asm volatile (lock "xaddl %0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
+ asm volatile (lock "addl %1, %0\n" \
+ : "+m" (*(ptr)) : "ri" (inc) \
+ : "memory", "cc"); \
break; \
case __X86_CASE_Q: \
- asm volatile (lock "xaddq %q0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
+ asm volatile (lock "addq %1, %0\n" \
+ : "+m" (*(ptr)) : "ri" (inc) \
+ : "memory", "cc"); \
break; \
default: \
- __xadd_wrong_size(); \
+ __add_wrong_size(); \
} \
__ret; \
})
/*
- * xadd() adds "inc" to "*ptr" and atomically returns the previous
- * value of "*ptr".
+ * add_*() adds "inc" to "*ptr"
*
- * xadd() is locked when multiple CPUs are online
- * xadd_sync() is always locked
- * xadd_local() is never locked
+ * __add() takes a lock prefix
+ * add_smp() is locked when multiple CPUs are online
+ * add_sync() is always locked
*/
-#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX)
-#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ")
-#define xadd_local(ptr, inc) __xadd((ptr), (inc), "")
+#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX)
+#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ")
+
+#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \
+({ \
+ bool __ret; \
+ __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \
+ __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \
+ BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \
+ BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \
+ VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \
+ VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \
+ asm volatile(pfx "cmpxchg%c4b %2; sete %0" \
+ : "=a" (__ret), "+d" (__old2), \
+ "+m" (*(p1)), "+m" (*(p2)) \
+ : "i" (2 * sizeof(long)), "a" (__old1), \
+ "b" (__new1), "c" (__new2)); \
+ __ret; \
+})
+
+#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \
+ __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2)
+
+#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \
+ __cmpxchg_double(, p1, p2, o1, o2, n1, n2)
#endif /* ASM_X86_CMPXCHG_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index fbebb07dd80b..53f4b219336b 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
#endif
-#define cmpxchg8b(ptr, o1, o2, n1, n2) \
-({ \
- char __ret; \
- __typeof__(o2) __dummy; \
- __typeof__(*(ptr)) __old1 = (o1); \
- __typeof__(o2) __old2 = (o2); \
- __typeof__(*(ptr)) __new1 = (n1); \
- __typeof__(o2) __new2 = (n2); \
- asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \
- : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
- : "a" (__old1), "d"(__old2), \
- "b" (__new1), "c" (__new2) \
- : "memory"); \
- __ret; })
-
-
-#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \
-({ \
- char __ret; \
- __typeof__(o2) __dummy; \
- __typeof__(*(ptr)) __old1 = (o1); \
- __typeof__(o2) __old2 = (o2); \
- __typeof__(*(ptr)) __new1 = (n1); \
- __typeof__(o2) __new2 = (n2); \
- asm volatile("cmpxchg8b %2; setz %1" \
- : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
- : "a" (__old), "d"(__old2), \
- "b" (__new1), "c" (__new2), \
- : "memory"); \
- __ret; })
-
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
- VM_BUG_ON((unsigned long)(ptr) % 8); \
- cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 4); \
- VM_BUG_ON((unsigned long)(ptr) % 8); \
- cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
-})
-
#define system_has_cmpxchg_double() cpu_has_cx8
#endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 285da02c38fa..614be87f1a9b 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val)
cmpxchg_local((ptr), (o), (n)); \
})
-#define cmpxchg16b(ptr, o1, o2, n1, n2) \
-({ \
- char __ret; \
- __typeof__(o2) __junk; \
- __typeof__(*(ptr)) __old1 = (o1); \
- __typeof__(o2) __old2 = (o2); \
- __typeof__(*(ptr)) __new1 = (n1); \
- __typeof__(o2) __new2 = (n2); \
- asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \
- : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
- : "b"(__new1), "c"(__new2), \
- "a"(__old1), "d"(__old2)); \
- __ret; })
-
-
-#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \
-({ \
- char __ret; \
- __typeof__(o2) __junk; \
- __typeof__(*(ptr)) __old1 = (o1); \
- __typeof__(o2) __old2 = (o2); \
- __typeof__(*(ptr)) __new1 = (n1); \
- __typeof__(o2) __new2 = (n2); \
- asm volatile("cmpxchg16b %2;setz %1" \
- : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \
- : "b"(__new1), "c"(__new2), \
- "a"(__old1), "d"(__old2)); \
- __ret; })
-
-#define cmpxchg_double(ptr, o1, o2, n1, n2) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- VM_BUG_ON((unsigned long)(ptr) % 16); \
- cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \
-})
-
-#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \
-({ \
- BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
- VM_BUG_ON((unsigned long)(ptr) % 16); \
- cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \
-})
-
#define system_has_cmpxchg_double() cpu_has_cx16
#endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f3444f700f36..17c5d4bdee5e 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -197,7 +197,10 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 078ad0caefc6..b903d5ea3941 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump);
extern void hw_breakpoint_restore(void);
+#ifdef CONFIG_X86_64
+DECLARE_PER_CPU(int, debug_stack_usage);
+static inline void debug_stack_usage_inc(void)
+{
+ __get_cpu_var(debug_stack_usage)++;
+}
+static inline void debug_stack_usage_dec(void)
+{
+ __get_cpu_var(debug_stack_usage)--;
+}
+int is_debug_stack(unsigned long addr);
+void debug_stack_set_zero(void);
+void debug_stack_reset(void);
+#else /* !X86_64 */
+static inline int is_debug_stack(unsigned long addr) { return 0; }
+static inline void debug_stack_set_zero(void) { }
+static inline void debug_stack_reset(void) { }
+static inline void debug_stack_usage_inc(void) { }
+static inline void debug_stack_usage_dec(void) { }
+#endif /* X86_64 */
+
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 41935fadfdfc..e95822d683f4 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
extern struct desc_ptr idt_descr;
extern gate_desc idt_table[];
+extern struct desc_ptr nmi_idt_descr;
+extern gate_desc nmi_idt_table[];
struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
@@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
desc->limit = (limit >> 16) & 0xf;
}
+#ifdef CONFIG_X86_64
+static inline void set_nmi_gate(int gate, void *addr)
+{
+ gate_desc s;
+
+ pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS);
+ write_idt_entry(nmi_idt_table, gate, &s);
+}
+#endif
+
static inline void _set_gate(int gate, unsigned type, void *addr,
unsigned dpl, unsigned ist, unsigned seg)
{
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9a2d644c08ef..ced283ac79df 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -4,6 +4,7 @@
#ifdef CONFIG_X86_32
#include <linux/types.h>
+#include <linux/log2.h>
/*
* do_div() is NOT a C function. It wants to return
@@ -21,15 +22,20 @@
({ \
unsigned long __upper, __low, __high, __mod, __base; \
__base = (base); \
- asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
- __upper = __high; \
- if (__high) { \
- __upper = __high % (__base); \
- __high = __high / (__base); \
+ if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
+ __mod = n & (__base - 1); \
+ n >>= ilog2(__base); \
+ } else { \
+ asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
+ __upper = __high; \
+ if (__high) { \
+ __upper = __high % (__base); \
+ __high = __high / (__base); \
+ } \
+ asm("divl %2" : "=a" (__low), "=d" (__mod) \
+ : "rm" (__base), "0" (__low), "1" (__upper)); \
+ asm("" : "=A" (n) : "a" (__low), "d" (__high)); \
} \
- asm("divl %2":"=a" (__low), "=d" (__mod) \
- : "rm" (__base), "0" (__low), "1" (__upper)); \
- asm("":"=A" (n) : "a" (__low), "d" (__high)); \
__mod; \
})
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 908b96957d88..37782566af24 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end)
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
-extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
+extern u64 early_reserve_e820(u64 sizet, u64 align);
void memblock_x86_fill(void);
void memblock_find_dma_reserve(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 7093e4a6a0bc..844f735fd63a 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,8 @@
#ifdef CONFIG_X86_32
+#define EFI_LOADER_SIGNATURE "EL32"
+
extern unsigned long asmlinkage efi_call_phys(void *, ...);
#define efi_call_phys0(f) efi_call_phys(f)
@@ -37,6 +39,8 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
#else /* !CONFIG_X86_32 */
+#define EFI_LOADER_SIGNATURE "EL64"
+
extern u64 efi_call0(void *fp);
extern u64 efi_call1(void *fp, u64 arg1);
extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 460c74e4852c..4da3c0c4c974 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -117,7 +117,7 @@ enum fixed_addresses {
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
__end_of_permanent_fixed_addresses,
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 55e4de613f0e..da0b3ca815b7 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -11,6 +11,7 @@ typedef struct {
#ifdef CONFIG_X86_LOCAL_APIC
unsigned int apic_timer_irqs; /* arch dependent */
unsigned int irq_spurious_count;
+ unsigned int icr_read_retry_count;
#endif
unsigned int x86_platform_ipis; /* arch dependent */
unsigned int apic_perf_irqs;
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea05644..6919e936345b 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
#ifdef CONFIG_SMP
#define safe_address (__per_cpu_offset[0])
#else
-#define safe_address (kstat_cpu(0).cpustat.user)
+#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
#endif
/*
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 8dbe353e41e1..adcc0ae73d09 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -5,6 +5,8 @@
extern void __init early_ioremap_page_table_range_init(void);
#endif
+extern void __init zone_sizes_init(void);
+
extern unsigned long __init
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 88c765e16410..74df3f1eddfd 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn)
return (insn->vex_prefix.value != 0);
}
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+ return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+ insn->displacement.got && insn->immediate.got;
+}
+
static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
{
if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4420993acc47..925b605eb5c6 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -3,11 +3,15 @@
#include <linux/notifier.h>
-#define IPCMSG_VRTC 0xFA /* Set vRTC device */
-
-/* Command id associated with message IPCMSG_VRTC */
-#define IPC_CMD_VRTC_SETTIME 1 /* Set time */
-#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
+#define IPCMSG_WARM_RESET 0xF0
+#define IPCMSG_COLD_RESET 0xF1
+#define IPCMSG_SOFT_RESET 0xF2
+#define IPCMSG_COLD_BOOT 0xF3
+
+#define IPCMSG_VRTC 0xFA /* Set vRTC device */
+ /* Command id associated with message IPCMSG_VRTC */
+ #define IPC_CMD_VRTC_SETTIME 1 /* Set time */
+ #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */
/* Read single register */
int intel_scu_ipc_ioread8(u16 addr, u8 *data);
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index 345c99cef152..dffc38ee6255 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -5,6 +5,7 @@ extern struct dma_map_ops nommu_dma_ops;
extern int force_iommu, no_iommu;
extern int iommu_detected;
extern int iommu_pass_through;
+extern int iommu_group_mf;
/* 10 seconds */
#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a026507893e9..ab4092e3214e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -181,6 +181,7 @@ struct x86_emulate_ops {
int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
+ int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata);
void (*halt)(struct x86_emulate_ctxt *ctxt);
void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
@@ -364,6 +365,7 @@ enum x86_intercept {
#endif
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b4973f4dab98..52d6640a5ca1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -16,10 +16,12 @@
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
+#include <linux/irq_work.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
+#include <linux/perf_event.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -31,6 +33,8 @@
#define KVM_MEMORY_SLOTS 32
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 4
+#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
+
#define KVM_MMIO_SIZE 16
#define KVM_PIO_PAGE_OFFSET 1
@@ -228,7 +232,7 @@ struct kvm_mmu_page {
* One bit set per slot which has memory
* in this shadow page.
*/
- DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ DECLARE_BITMAP(slot_bitmap, KVM_MEM_SLOTS_NUM);
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
@@ -239,14 +243,9 @@ struct kvm_mmu_page {
int clear_spte_count;
#endif
- struct rcu_head rcu;
-};
+ int write_flooding_count;
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
+ struct rcu_head rcu;
};
struct kvm_pio_request {
@@ -294,6 +293,37 @@ struct kvm_mmu {
u64 pdptrs[4]; /* pae */
};
+enum pmc_type {
+ KVM_PMC_GP = 0,
+ KVM_PMC_FIXED,
+};
+
+struct kvm_pmc {
+ enum pmc_type type;
+ u8 idx;
+ u64 counter;
+ u64 eventsel;
+ struct perf_event *perf_event;
+ struct kvm_vcpu *vcpu;
+};
+
+struct kvm_pmu {
+ unsigned nr_arch_gp_counters;
+ unsigned nr_arch_fixed_counters;
+ unsigned available_event_types;
+ u64 fixed_ctr_ctrl;
+ u64 global_ctrl;
+ u64 global_status;
+ u64 global_ovf_ctrl;
+ u64 counter_bitmask[2];
+ u64 global_ctrl_mask;
+ u8 version;
+ struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC];
+ struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED];
+ struct irq_work irq_work;
+ u64 reprogram_pmi;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -345,19 +375,10 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu *walk_mmu;
- /* only needed in kvm_pv_mmu_op() path, but it's hot so
- * put it here to avoid allocation */
- struct kvm_pv_mmu_op_buffer mmu_op_buffer;
-
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
- gfn_t last_pt_write_gfn;
- int last_pt_write_count;
- u64 *last_pte_updated;
- gfn_t last_pte_gfn;
-
struct fpu guest_fpu;
u64 xcr0;
@@ -436,6 +457,8 @@ struct kvm_vcpu_arch {
unsigned access;
gfn_t mmio_gfn;
+ struct kvm_pmu pmu;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
@@ -444,6 +467,9 @@ struct kvm_vcpu_arch {
cpumask_var_t wbinvd_dirty_mask;
+ unsigned long last_retry_eip;
+ unsigned long last_retry_addr;
+
struct {
bool halted;
gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
@@ -459,7 +485,6 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
- atomic_t invlpg_counter;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
@@ -660,6 +685,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+ struct kvm_memory_slot *slot);
void kvm_mmu_zap_all(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
@@ -668,8 +695,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret);
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
@@ -692,6 +717,7 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
+#define EMULTYPE_RETRY (1 << 3)
int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
int emulation_type, void *insn, int insn_len);
@@ -734,6 +760,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu);
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
@@ -754,13 +781,14 @@ int fx_init(struct kvm_vcpu *vcpu);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated);
+ const u8 *new, int bytes);
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
@@ -782,6 +810,11 @@ void kvm_disable_tdp(void);
int complete_pio(struct kvm_vcpu *vcpu);
bool kvm_check_iopl(struct kvm_vcpu *vcpu);
+static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+{
+ return gpa;
+}
+
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -894,4 +927,17 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+int kvm_is_in_guest(void);
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h
index 853728519ae9..88d0c3c74c13 100644
--- a/arch/x86/include/asm/mach_timer.h
+++ b/arch/x86/include/asm/mach_timer.h
@@ -15,7 +15,7 @@
#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
#define CALIBRATE_LATCH \
- ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
+ ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
static inline void mach_prepare_counter(void)
{
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index 72a8b52e7dfd..a01e7ec7d237 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -17,7 +17,7 @@
#define NMI_REASON_CLEAR_IOCHK 0x08
#define NMI_REASON_CLEAR_MASK 0x0f
-static inline unsigned char get_nmi_reason(void)
+static inline unsigned char default_get_nmi_reason(void)
{
return inb(NMI_REASON_PORT);
}
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 01fdf5674e24..0e8e85bb7c51 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void)
#else
#define lock_cmos_prefix(reg) do {} while (0)
#define lock_cmos_suffix(reg) do {} while (0)
-#define lock_cmos(reg)
-#define unlock_cmos()
+#define lock_cmos(reg) do { } while (0)
+#define unlock_cmos() do { } while (0)
#define do_i_have_lock_cmos() 0
#define current_lock_cmos_reg() 0
#endif
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c9321f34e55b..6aefb14cbbc5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -50,10 +50,11 @@
#define MCJ_CTX_MASK 3
#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK)
#define MCJ_CTX_RANDOM 0 /* inject context: random */
-#define MCJ_CTX_PROCESS 1 /* inject context: process */
-#define MCJ_CTX_IRQ 2 /* inject context: IRQ */
-#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */
-#define MCJ_EXCEPTION 8 /* raise as exception */
+#define MCJ_CTX_PROCESS 0x1 /* inject context: process */
+#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */
+#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */
+#define MCJ_EXCEPTION 0x8 /* raise as exception */
+#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */
/* Fields are zero when not available */
struct mce {
@@ -120,7 +121,8 @@ struct mce_log {
#ifdef __KERNEL__
-extern struct atomic_notifier_head x86_mce_decoder_chain;
+extern void mce_register_decode_chain(struct notifier_block *nb);
+extern void mce_unregister_decode_chain(struct notifier_block *nb);
#include <linux/percpu.h>
#include <linux/init.h>
@@ -149,7 +151,7 @@ static inline void enable_p5_mce(void) {}
void mce_setup(struct mce *m);
void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_sysdev);
+extern struct device *mce_device[CONFIG_NR_CPUS];
/*
* Maximum banks number.
@@ -201,7 +203,10 @@ int mce_notify_irq(void);
void mce_notify_process(void);
DECLARE_PER_CPU(struct mce, injectm);
-extern struct file_operations mce_chrdev_ops;
+
+extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off));
/*
* Exception handler
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
deleted file mode 100644
index 0cd3800f33b9..000000000000
--- a/arch/x86/include/asm/memblock.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef _X86_MEMBLOCK_H
-#define _X86_MEMBLOCK_H
-
-#define ARCH_DISCARD_MEMBLOCK
-
-u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
-
-void memblock_x86_reserve_range(u64 start, u64 end, char *name);
-void memblock_x86_free_range(u64 start, u64 end);
-struct range;
-int __get_free_all_memory_range(struct range **range, int nodeid,
- unsigned long start_pfn, unsigned long end_pfn);
-int get_free_all_memory_range(struct range **rangep, int nodeid);
-
-void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn);
-u64 memblock_x86_hole_size(u64 start, u64 end);
-u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
-u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
-u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
-bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
-
-#endif
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 24215072d0e1..4ebe157bf73d 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,7 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
#ifdef CONFIG_MICROCODE_AMD
extern struct microcode_ops * __init init_amd_microcode(void);
+extern void __exit exit_amd_microcode(void);
static inline void get_ucode_data(void *to, const u8 *from, size_t n)
{
@@ -59,6 +60,7 @@ static inline struct microcode_ops * __init init_amd_microcode(void)
{
return NULL;
}
+static inline void __exit exit_amd_microcode(void) {}
#endif
#endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 719f00b28ff5..0a0a95460434 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -31,11 +31,20 @@ enum mrst_cpu_type {
};
extern enum mrst_cpu_type __mrst_cpu_chip;
+
+#ifdef CONFIG_X86_INTEL_MID
+
static inline enum mrst_cpu_type mrst_identify_cpu(void)
{
return __mrst_cpu_chip;
}
+#else /* !CONFIG_X86_INTEL_MID */
+
+#define mrst_identify_cpu() (0)
+
+#endif /* !CONFIG_X86_INTEL_MID */
+
enum mrst_timer_options {
MRST_TIMER_DEFAULT,
MRST_TIMER_APBT_ONLY,
@@ -44,6 +53,13 @@ enum mrst_timer_options {
extern enum mrst_timer_options mrst_timer_options;
+/*
+ * Penwell uses spread spectrum clock, so the freq number is not exactly
+ * the same as reported by MSR based on SDM.
+ */
+#define PENWELL_FSB_FREQ_83SKU 83200
+#define PENWELL_FSB_FREQ_100SKU 99840
+
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
@@ -51,7 +67,7 @@ extern struct console early_mrst_console;
extern void mrst_early_console_init(void);
extern struct console early_hsu_console;
-extern void hsu_early_console_init(void);
+extern void hsu_early_console_init(const char *);
extern void intel_scu_devices_create(void);
extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 084ef95274cd..95203d40ffdd 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -169,7 +169,14 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
return native_write_msr_safe(msr, low, high);
}
-/* rdmsr with exception handling */
+/*
+ * rdmsr with exception handling.
+ *
+ * Please note that the exception handling works only after we've
+ * switched to the "smart" #GP handler in trap_init() which knows about
+ * exception tables - using this macro earlier than that causes machine
+ * hangs on boxes which do not implement the @msr in the first argument.
+ */
#define rdmsr_safe(msr, p1, p2) \
({ \
int __err; \
diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h
new file mode 100644
index 000000000000..660f843df928
--- /dev/null
+++ b/arch/x86/include/asm/numachip/numachip_csr.h
@@ -0,0 +1,167 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific Header file
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H
+
+#include <linux/numa.h>
+#include <linux/percpu.h>
+#include <linux/io.h>
+#include <linux/swab.h>
+#include <asm/types.h>
+#include <asm/processor.h>
+
+#define CSR_NODE_SHIFT 16
+#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT)
+#define CSR_NODE_MASK 0x0fff /* 4K nodes */
+
+/* 32K CSR space, b15 indicates geo/non-geo */
+#define CSR_OFFSET_MASK 0x7fffUL
+
+/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */
+#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL
+#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL
+#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1)
+
+/*
+ * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however
+ * when using the direct mapping on x86_64, both start and size needs to be
+ * aligned with PMD_SIZE which is 2M
+ */
+#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL
+#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL
+#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1)
+
+static inline void *gcsr_address(int node, unsigned long offset)
+{
+ return __va(NUMACHIP_GCSR_BASE | (1UL << 15) |
+ CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline void *lcsr_address(unsigned long offset)
+{
+ return __va(NUMACHIP_LCSR_BASE | (1UL << 15) |
+ CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK));
+}
+
+static inline unsigned int read_gcsr(int node, unsigned long offset)
+{
+ return swab32(readl(gcsr_address(node, offset)));
+}
+
+static inline void write_gcsr(int node, unsigned long offset, unsigned int val)
+{
+ writel(swab32(val), gcsr_address(node, offset));
+}
+
+static inline unsigned int read_lcsr(unsigned long offset)
+{
+ return swab32(readl(lcsr_address(offset)));
+}
+
+static inline void write_lcsr(unsigned long offset, unsigned int val)
+{
+ writel(swab32(val), lcsr_address(offset));
+}
+
+/* ========================================================================= */
+/* CSR_G0_STATE_CLEAR */
+/* ========================================================================= */
+
+#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12))
+union numachip_csr_g0_state_clear {
+ unsigned int v;
+ struct numachip_csr_g0_state_clear_s {
+ unsigned int _state:2;
+ unsigned int _rsvd_2_6:5;
+ unsigned int _lost:1;
+ unsigned int _rsvd_8_31:24;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G0_NODE_IDS */
+/* ========================================================================= */
+
+#define CSR_G0_NODE_IDS (0x008 + (0 << 12))
+union numachip_csr_g0_node_ids {
+ unsigned int v;
+ struct numachip_csr_g0_node_ids_s {
+ unsigned int _initialid:16;
+ unsigned int _nodeid:12;
+ unsigned int _rsvd_28_31:4;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G3_EXT_IRQ_GEN */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12))
+union numachip_csr_g3_ext_irq_gen {
+ unsigned int v;
+ struct numachip_csr_g3_ext_irq_gen_s {
+ unsigned int _vector:8;
+ unsigned int _msgtype:3;
+ unsigned int _index:5;
+ unsigned int _destination_apic_id:16;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G3_EXT_IRQ_STATUS */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12))
+union numachip_csr_g3_ext_irq_status {
+ unsigned int v;
+ struct numachip_csr_g3_ext_irq_status_s {
+ unsigned int _result:32;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G3_EXT_IRQ_DEST */
+/* ========================================================================= */
+
+#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12))
+union numachip_csr_g3_ext_irq_dest {
+ unsigned int v;
+ struct numachip_csr_g3_ext_irq_dest_s {
+ unsigned int _irq:8;
+ unsigned int _rsvd_8_31:24;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G3_NC_ATT_MAP_SELECT */
+/* ========================================================================= */
+
+#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12))
+union numachip_csr_g3_nc_att_map_select {
+ unsigned int v;
+ struct numachip_csr_g3_nc_att_map_select_s {
+ unsigned int _upper_address_bits:4;
+ unsigned int _select_ram:4;
+ unsigned int _rsvd_8_31:24;
+ } s;
+};
+
+/* ========================================================================= */
+/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */
+/* ========================================================================= */
+
+#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12))
+
+#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */
+
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d498943b906c..df75d07571ce 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -112,19 +112,28 @@ static inline void x86_teardown_msi_irq(unsigned int irq)
{
x86_msi.teardown_msi_irq(irq);
}
+static inline void x86_restore_msi_irqs(struct pci_dev *dev, int irq)
+{
+ x86_msi.restore_msi_irqs(dev, irq);
+}
#define arch_setup_msi_irqs x86_setup_msi_irqs
#define arch_teardown_msi_irqs x86_teardown_msi_irqs
#define arch_teardown_msi_irq x86_teardown_msi_irq
+#define arch_restore_msi_irqs x86_restore_msi_irqs
/* implemented in arch/x86/kernel/apic/io_apic. */
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
+void native_restore_msi_irqs(struct pci_dev *dev, int irq);
/* default to the implementation in drivers/lib/msi.c */
#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+#define HAVE_DEFAULT_MSI_RESTORE_IRQS
void default_teardown_msi_irqs(struct pci_dev *dev);
+void default_restore_msi_irqs(struct pci_dev *dev, int irq);
#else
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#define default_teardown_msi_irqs NULL
+#define default_restore_msi_irqs NULL
#endif
#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index e38197806853..b3a531746026 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -44,8 +44,6 @@ enum pci_bf_sort_state {
/* pci-i386.c */
-extern unsigned int pcibios_max_latency;
-
void pcibios_resource_survey(void);
void pcibios_set_cache_line_size(void);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 3470c9d0ebba..7a11910a63c4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -414,22 +414,6 @@ do { \
#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
-#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
-
#ifndef CONFIG_M386
#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
@@ -445,29 +429,22 @@ do { \
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#endif /* !CONFIG_M386 */
#ifdef CONFIG_X86_CMPXCHG64
-#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
+#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \
({ \
- char __ret; \
- typeof(o1) __o1 = o1; \
- typeof(o1) __n1 = n1; \
- typeof(o2) __o2 = o2; \
- typeof(o2) __n2 = n2; \
- typeof(o2) __dummy = n2; \
+ bool __ret; \
+ typeof(pcp1) __o1 = (o1), __n1 = (n1); \
+ typeof(pcp2) __o2 = (o2), __n2 = (n2); \
asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
- : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
- : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
+ : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \
+ : "b" (__n1), "c" (__n2), "a" (__o1)); \
__ret; \
})
-#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
+#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double
#endif /* CONFIG_X86_CMPXCHG64 */
/*
@@ -495,44 +472,28 @@ do { \
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
-#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
-#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
-#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
-#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-
/*
* Pretty complex macro to generate cmpxchg16 instruction. The instruction
* is not supported on early AMD64 processors so we must be able to emulate
* it in software. The address used in the cmpxchg16 instruction must be
* aligned to a 16 byte boundary.
*/
-#ifdef CONFIG_SMP
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
-#else
-#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
-#endif
-#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
+#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \
({ \
- char __ret; \
- typeof(o1) __o1 = o1; \
- typeof(o1) __n1 = n1; \
- typeof(o2) __o2 = o2; \
- typeof(o2) __n2 = n2; \
- typeof(o2) __dummy; \
- alternative_io(CMPXCHG16B_EMU_CALL, \
- "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
+ bool __ret; \
+ typeof(pcp1) __o1 = (o1), __n1 = (n1); \
+ typeof(pcp2) __o2 = (o2), __n2 = (n2); \
+ alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
+ "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
X86_FEATURE_CX16, \
- ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
- "S" (&pcp1), "b"(__n1), "c"(__n2), \
- "a"(__o1), "d"(__o2) : "memory"); \
+ ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \
+ "+m" (pcp2), "+d" (__o2)), \
+ "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \
__ret; \
})
-#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
-#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
+#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double
#endif
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f61c62f7d5d8..096c975e099f 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -57,6 +57,7 @@
(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
+#define ARCH_PERFMON_EVENTS_COUNT 7
/*
* Intel "Architectural Performance Monitoring" CPUID
@@ -72,6 +73,19 @@ union cpuid10_eax {
unsigned int full;
};
+union cpuid10_ebx {
+ struct {
+ unsigned int no_unhalted_core_cycles:1;
+ unsigned int no_instructions_retired:1;
+ unsigned int no_unhalted_reference_cycles:1;
+ unsigned int no_llc_reference:1;
+ unsigned int no_llc_misses:1;
+ unsigned int no_branch_instruction_retired:1;
+ unsigned int no_branch_misses_retired:1;
+ } split;
+ unsigned int full;
+};
+
union cpuid10_edx {
struct {
unsigned int num_counters_fixed:5;
@@ -81,6 +95,15 @@ union cpuid10_edx {
unsigned int full;
};
+struct x86_pmu_capability {
+ int version;
+ int num_counters_gp;
+ int num_counters_fixed;
+ int bit_width_gp;
+ int bit_width_fixed;
+ unsigned int events_mask;
+ int events_mask_len;
+};
/*
* Fixed-purpose performance events:
@@ -89,23 +112,24 @@ union cpuid10_edx {
/*
* All 3 fixed-mode PMCs are configured via this single MSR:
*/
-#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
/*
* The counts are available in three separate MSRs:
*/
/* Instr_Retired.Any: */
-#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
-#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
+#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
/* CPU_CLK_Unhalted.Core: */
-#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
-#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
+#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
/* CPU_CLK_Unhalted.Ref: */
-#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
-#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
+#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
+#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2)
+#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES)
/*
* We model BTS tracing as another fixed-mode PMC.
@@ -202,6 +226,7 @@ struct perf_guest_switch_msr {
};
extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
#else
static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
{
@@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
return NULL;
}
+static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ memset(cap, 0, sizeof(*cap));
+}
+
static inline void perf_events_lapic_init(void) { }
#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 18601c86fab1..49afb3f41eb6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
pte_update(mm, addr, ptep);
}
-#define flush_tlb_fix_spurious_fault(vma, address)
+#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 2dddb317bb39..f8ab3eaad128 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -6,6 +6,7 @@
* EFLAGS bits
*/
#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
+#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b650435ffb53..aa9088c26931 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -99,7 +99,6 @@ struct cpuinfo_x86 {
u16 apicid;
u16 initial_apicid;
u16 x86_clflush_size;
-#ifdef CONFIG_SMP
/* number of cores as seen by the OS: */
u16 booted_cores;
/* Physical processor id: */
@@ -110,7 +109,6 @@ struct cpuinfo_x86 {
u8 compute_unit_id;
/* Index into per_cpu list: */
u16 cpu_index;
-#endif
u32 microcode;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
diff --git a/arch/x86/include/asm/serpent.h b/arch/x86/include/asm/serpent.h
new file mode 100644
index 000000000000..d3ef63fe0c81
--- /dev/null
+++ b/arch/x86/include/asm/serpent.h
@@ -0,0 +1,63 @@
+#ifndef ASM_X86_SERPENT_H
+#define ASM_X86_SERPENT_H
+
+#include <linux/crypto.h>
+#include <crypto/serpent.h>
+
+#ifdef CONFIG_X86_32
+
+#define SERPENT_PARALLEL_BLOCKS 4
+
+asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_4way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_4way(ctx, dst, src);
+}
+
+#else
+
+#define SERPENT_PARALLEL_BLOCKS 8
+
+asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src, bool xor);
+asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src);
+
+static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, false);
+}
+
+static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ __serpent_enc_blk_8way(ctx, dst, src, true);
+}
+
+static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
+ const u8 *src)
+{
+ serpent_dec_blk_8way(ctx, dst, src);
+}
+
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 9756551ec760..d0f19f9fb846 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -47,7 +47,7 @@ extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern void setup_default_timer_irq(void);
-#ifdef CONFIG_X86_MRST
+#ifdef CONFIG_X86_INTEL_MID
extern void x86_mrst_early_setup(void);
#else
static inline void x86_mrst_early_setup(void) { }
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 73b11bc0ae6f..0434c400287c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -225,5 +225,11 @@ extern int hard_smp_processor_id(void);
#endif /* CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_DEBUG_NMI_SELFTEST
+extern void nmi_selftest(void);
+#else
+#define nmi_selftest() do { } while (0)
+#endif
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_SMP_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 972c260919a3..a82c2bf504b6 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}
-#if (NR_CPUS < 256)
static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
{
- asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
- : "+m" (lock->head_tail)
- :
- : "memory", "cc");
+ __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX);
}
-#else
-static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
-{
- asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
- : "+m" (lock->head_tail)
- :
- : "memory", "cc");
-}
-#endif
static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
{
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index c2ff2a1d845e..2d2f01ce6dcb 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -401,6 +401,7 @@ extern unsigned long arch_align_stack(unsigned long sp);
extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
void default_idle(void);
+bool set_pm_idle_to_default(void);
void stop_this_cpu(void *dummy);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a1fe5c127b52..bc817cd8b443 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -40,7 +40,8 @@ struct thread_info {
*/
__u8 supervisor_stack[0];
#endif
- int uaccess_err;
+ unsigned int sig_on_uaccess_error:1;
+ unsigned int uaccess_err:1; /* uaccess failed */
};
#define INIT_THREAD_INFO(tsk) \
@@ -90,7 +91,6 @@ struct thread_info {
#define TIF_MEMDIE 20 /* is terminating due to OOM killer */
#define TIF_DEBUG 21 /* uses debug registers */
#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
-#define TIF_FREEZE 23 /* is freezing for suspend */
#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */
#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */
@@ -112,7 +112,6 @@ struct thread_info {
#define _TIF_FORK (1 << TIF_FORK)
#define _TIF_DEBUG (1 << TIF_DEBUG)
#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
-#define _TIF_FREEZE (1 << TIF_FREEZE)
#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES)
@@ -231,6 +230,12 @@ static inline struct thread_info *current_thread_info(void)
movq PER_CPU_VAR(kernel_stack),reg ; \
subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
+/*
+ * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
+ * a certain register (to be used in assembler memory operands).
+ */
+#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+
#endif
#endif /* !X86_32 */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index fa7b9176b76c..431793e5d484 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -32,6 +32,22 @@ extern int no_timer_check;
* (mathieu.desnoyers@polymtl.ca)
*
* -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ *
+ * In:
+ *
+ * ns = cycles * cyc2ns_scale / SC
+ *
+ * Although we may still have enough bits to store the value of ns,
+ * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
+ * leading to an incorrect result.
+ *
+ * To avoid this, we can decompose 'cycles' into quotient and remainder
+ * of division by SC. Then,
+ *
+ * ns = (quot * SC + rem) * cyc2ns_scale / SC
+ * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
+ *
+ * - sqazi@google.com
*/
DECLARE_PER_CPU(unsigned long, cyc2ns);
@@ -41,9 +57,14 @@ DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
{
+ unsigned long long quot;
+ unsigned long long rem;
int cpu = smp_processor_id();
unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
- ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR;
+ quot = (cyc >> CYC2NS_SCALE_FACTOR);
+ rem = cyc & ((1ULL << CYC2NS_SCALE_FACTOR) - 1);
+ ns += quot * per_cpu(cyc2ns, cpu) +
+ ((rem * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR);
return ns;
}
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c00692476e9f..b9676ae37ada 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void);
.balance_interval = 1, \
}
-#ifdef CONFIG_X86_64
extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
-#endif
#else /* !CONFIG_NUMA */
@@ -174,7 +172,7 @@ static inline void arch_fix_phys_package_id(int num, u32 slot)
}
struct pci_bus;
-void x86_pci_root_bus_res_quirks(struct pci_bus *b);
+void x86_pci_root_bus_resources(int bus, struct list_head *resources);
#ifdef CONFIG_SMP
#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd181e2..15d99153a96d 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void);
extern int check_tsc_unstable(void);
extern unsigned long native_calibrate_tsc(void);
+extern int tsc_clocksource_reliable;
+
/*
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 36361bf6fdd1..8be5f54d9360 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; };
barrier();
#define uaccess_catch(err) \
- (err) |= current_thread_info()->uaccess_err; \
+ (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
current_thread_info()->uaccess_err = prev_err; \
} while (0)
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 10474fb1185d..cf1d73643f60 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -57,6 +57,7 @@
#define UV1_HUB_PART_NUMBER 0x88a5
#define UV2_HUB_PART_NUMBER 0x8eb8
+#define UV2_HUB_PART_NUMBER_X 0x1111
/* Compat: if this #define is present, UV headers support UV2 */
#define UV2_HUB_IS_SUPPORTED 1
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index d3d859035af9..517d4767ffdd 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -7,6 +7,7 @@
struct mpc_bus;
struct mpc_cpu;
struct mpc_table;
+struct cpuinfo_x86;
/**
* struct x86_init_mpparse - platform specific mpparse ops
@@ -147,11 +148,13 @@ struct x86_init_ops {
*/
struct x86_cpuinit_ops {
void (*setup_percpu_clockev)(void);
+ void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node);
};
/**
* struct x86_platform_ops - platform specific runtime functions
* @calibrate_tsc: calibrate TSC
+ * @wallclock_init: init the wallclock device
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
* @is_untracked_pat_range exclude from PAT logic
@@ -160,11 +163,13 @@ struct x86_cpuinit_ops {
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
+ void (*wallclock_init)(void);
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long nowtime);
void (*iommu_shutdown)(void);
bool (*is_untracked_pat_range)(u64 start, u64 end);
void (*nmi_init)(void);
+ unsigned char (*get_nmi_reason)(void);
int (*i8042_detect)(void);
};
@@ -174,6 +179,7 @@ struct x86_msi_ops {
int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
void (*teardown_msi_irq)(unsigned int irq);
void (*teardown_msi_irqs)(struct pci_dev *dev);
+ void (*restore_msi_irqs)(struct pci_dev *dev, int irq);
};
extern struct x86_init_ops x86_init;
@@ -183,5 +189,6 @@ extern struct x86_msi_ops x86_msi;
extern void x86_init_noop(void);
extern void x86_init_uint_noop(unsigned int unused);
+extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node);
#endif
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8c473d9d0b83..5369059c07a9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
+obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o
obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4558f0d0822d..ce664f33ea8e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,6 +219,8 @@ static int __init
acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
{
struct acpi_madt_local_x2apic *processor = NULL;
+ int apic_id;
+ u8 enabled;
processor = (struct acpi_madt_local_x2apic *)header;
@@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
acpi_table_print_madt_entry(header);
+ apic_id = processor->local_apic_id;
+ enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
#ifdef CONFIG_X86_X2APIC
/*
* We need to register disabled CPU as well to permit
@@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- acpi_register_lapic(processor->local_apic_id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED);
+ if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled)
+ printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+ else
+ acpi_register_lapic(apic_id, enabled);
#else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
#endif
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c63822816249..1f84794f0759 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -738,5 +738,5 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
- __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+ __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..be16854591cc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,20 +119,49 @@ bool __init early_is_amd_nb(u32 device)
return false;
}
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+ u32 address;
+ u64 base, msr;
+ unsigned segn_busn_bits;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ return NULL;
+
+ /* assume all cpus from fam10h have mmconfig */
+ if (boot_cpu_data.x86 < 0x10)
+ return NULL;
+
+ address = MSR_FAM10H_MMIO_CONF_BASE;
+ rdmsrl(address, msr);
+
+ /* mmconfig is not enabled */
+ if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+ return NULL;
+
+ base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+ segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+ FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+ res->flags = IORESOURCE_MEM;
+ res->start = base;
+ res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+ return res;
+}
+
int amd_get_subcaches(int cpu)
{
struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
unsigned int mask;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
return 0;
pci_read_config_dword(link, 0x1d4, &mask);
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
return (mask >> (4 * cuid)) & 0xf;
}
@@ -141,7 +170,7 @@ int amd_set_subcaches(int cpu, int mask)
static unsigned int reset, ban;
struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
unsigned int reg;
- int cuid = 0;
+ int cuid;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
return -EINVAL;
@@ -159,9 +188,7 @@ int amd_set_subcaches(int cpu, int mask)
pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
}
-#ifdef CONFIG_SMP
cuid = cpu_data(cpu).compute_unit_id;
-#endif
mask <<= 4 * cuid;
mask |= (0xf ^ (1 << cuid)) << 26;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 3d2661ca6542..6e76c191a835 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void)
*/
addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
aper_size, aper_size);
- if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
+ if (!addr || addr + aper_size > GART_MAX_ADDR) {
printk(KERN_ERR
"Cannot allocate aperture memory hole (%lx,%uK)\n",
addr, aper_size>>10);
return 0;
}
- memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
+ memblock_reserve(addr, aper_size);
/*
* Kmemleak should not scan this block as it may not be mapped via the
* kernel direct mapping.
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 767fd04f2843..0ae0323b1f9c 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
# APIC probe will depend on the listing order here
+obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a2fd72e0ab35..2eec05b6d1b8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer);
int x2apic_mode;
#ifdef CONFIG_X86_X2APIC
/* x2apic enabled before OS handover */
-static int x2apic_preenabled;
+int x2apic_preenabled;
+static int x2apic_disabled;
+static int nox2apic;
static __init int setup_nox2apic(char *str)
{
if (x2apic_enabled()) {
- pr_warning("Bios already enabled x2apic, "
- "can't enforce nox2apic");
- return 0;
- }
+ int apicid = native_apic_msr_read(APIC_ID);
+
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+
+ pr_warning("x2apic already enabled. will disable it\n");
+ } else
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+
+ nox2apic = 1;
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
return 0;
}
early_param("nox2apic", setup_nox2apic);
@@ -186,7 +196,7 @@ static struct resource lapic_resource = {
.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
};
-static unsigned int calibration_result;
+unsigned int lapic_timer_frequency = 0;
static void apic_pm_activate(void);
@@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void)
send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
if (!send_status)
break;
+ inc_irq_stat(icr_read_retry_count);
udelay(100);
} while (timeout++ < 1000);
@@ -454,7 +465,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
case CLOCK_EVT_MODE_ONESHOT:
- __setup_APIC_LVTT(calibration_result,
+ __setup_APIC_LVTT(lapic_timer_frequency,
mode != CLOCK_EVT_MODE_PERIODIC, 1);
break;
case CLOCK_EVT_MODE_UNUSED:
@@ -638,6 +649,25 @@ static int __init calibrate_APIC_clock(void)
long delta, deltatsc;
int pm_referenced = 0;
+ /**
+ * check if lapic timer has already been calibrated by platform
+ * specific routine, such as tsc calibration code. if so, we just fill
+ * in the clockevent structure and return.
+ */
+
+ if (lapic_timer_frequency) {
+ apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
+ lapic_timer_frequency);
+ lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
+ TICK_NSEC, lapic_clockevent.shift);
+ lapic_clockevent.max_delta_ns =
+ clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ lapic_clockevent.min_delta_ns =
+ clockevent_delta2ns(0xF, &lapic_clockevent);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+ return 0;
+ }
+
local_irq_disable();
/* Replace the global interrupt handler */
@@ -679,12 +709,12 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
- calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
+ lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
- calibration_result);
+ lapic_timer_frequency);
if (cpu_has_tsc) {
apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
@@ -695,13 +725,13 @@ static int __init calibrate_APIC_clock(void)
apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
"%u.%04u MHz.\n",
- calibration_result / (1000000 / HZ),
- calibration_result % (1000000 / HZ));
+ lapic_timer_frequency / (1000000 / HZ),
+ lapic_timer_frequency % (1000000 / HZ));
/*
* Do a sanity check on the APIC calibration result
*/
- if (calibration_result < (1000000 / HZ)) {
+ if (lapic_timer_frequency < (1000000 / HZ)) {
local_irq_enable();
pr_warning("APIC frequency too slow, disabling apic timer\n");
return -1;
@@ -857,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
- exit_idle();
irq_enter();
+ exit_idle();
local_apic_timer_interrupt();
irq_exit();
@@ -1412,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void)
}
#ifdef CONFIG_X86_X2APIC
+/*
+ * Need to disable xapic and x2apic at the same time and then enable xapic mode
+ */
+static inline void __disable_x2apic(u64 msr)
+{
+ wrmsrl(MSR_IA32_APICBASE,
+ msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+}
+
+static __init void disable_x2apic(void)
+{
+ u64 msr;
+
+ if (!cpu_has_x2apic)
+ return;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE) {
+ u32 x2apic_id = read_apic_id();
+
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ pr_info("Disabling x2apic\n");
+ __disable_x2apic(msr);
+
+ if (nox2apic) {
+ clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ }
+
+ x2apic_disabled = 1;
+ x2apic_mode = 0;
+
+ register_lapic_address(mp_lapic_addr);
+ }
+}
+
void check_x2apic(void)
{
if (x2apic_enabled()) {
@@ -1422,15 +1491,20 @@ void check_x2apic(void)
void enable_x2apic(void)
{
- int msr, msr2;
+ u64 msr;
+
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (x2apic_disabled) {
+ __disable_x2apic(msr);
+ return;
+ }
if (!x2apic_mode)
return;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
if (!(msr & X2APIC_ENABLE)) {
printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
}
}
#endif /* CONFIG_X86_X2APIC */
@@ -1467,25 +1541,34 @@ void __init enable_IR_x2apic(void)
ret = save_ioapic_entries();
if (ret) {
pr_info("Saving IO-APIC state failed: %d\n", ret);
- goto out;
+ return;
}
local_irq_save(flags);
legacy_pic->mask_all();
mask_ioapic_entries();
+ if (x2apic_preenabled && nox2apic)
+ disable_x2apic();
+
if (dmar_table_init_ret)
ret = -1;
else
ret = enable_IR();
+ if (!x2apic_supported())
+ goto skip_x2apic;
+
if (ret < 0) {
/* IR is required if there is APIC ID > 255 even when running
* under KVM
*/
if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available())
- goto nox2apic;
+ !hypervisor_x2apic_available()) {
+ if (x2apic_preenabled)
+ disable_x2apic();
+ goto skip_x2apic;
+ }
/*
* without IR all CPUs can be addressed by IOAPIC/MSI
* only in physical mode
@@ -1493,8 +1576,10 @@ void __init enable_IR_x2apic(void)
x2apic_force_phys();
}
- if (ret == IRQ_REMAP_XAPIC_MODE)
- goto nox2apic;
+ if (ret == IRQ_REMAP_XAPIC_MODE) {
+ pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
+ goto skip_x2apic;
+ }
x2apic_enabled = 1;
@@ -1504,22 +1589,11 @@ void __init enable_IR_x2apic(void)
pr_info("Enabled x2apic\n");
}
-nox2apic:
+skip_x2apic:
if (ret < 0) /* IR enabling failed */
restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
-
-out:
- if (x2apic_enabled || !x2apic_supported())
- return;
-
- if (x2apic_preenabled)
- panic("x2apic: enabled by BIOS but kernel init failed.");
- else if (ret == IRQ_REMAP_XAPIC_MODE)
- pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
- else if (ret < 0)
- pr_info("x2apic not enabled, IRQ remapping init failed\n");
}
#ifdef CONFIG_X86_64
@@ -1790,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
{
u32 v;
- exit_idle();
irq_enter();
+ exit_idle();
/*
* Check if this really is a spurious interrupt and ACK it
* if it is a vectored one. Just in case...
@@ -1827,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs)
"Illegal register address", /* APIC Error Bit 7 */
};
- exit_idle();
irq_enter();
+ exit_idle();
/* First tickle the hardware, only then report what went on. -- REW */
v0 = apic_read(APIC_ESR);
apic_write(APIC_ESR, 0);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index f7a41e4cae47..8c3cdded6f2b 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
* an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
* document number 292116). So here it goes...
*/
-static void flat_init_apic_ldr(void)
+void flat_init_apic_ldr(void)
{
unsigned long val;
unsigned long num, id;
@@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
return initial_apic_id >> index_msb;
}
+static int flat_probe(void)
+{
+ return 1;
+}
+
static struct apic apic_flat = {
.name = "flat",
- .probe = NULL,
+ .probe = flat_probe,
.acpi_madt_oem_check = flat_acpi_madt_oem_check,
.apic_id_registered = flat_apic_id_registered,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
new file mode 100644
index 000000000000..09d3d8c1cd99
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -0,0 +1,294 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Numascale NumaConnect-Specific APIC Code
+ *
+ * Copyright (C) 2011 Numascale AS. All rights reserved.
+ *
+ * Send feedback to <support@numascale.com>
+ *
+ */
+
+#include <linux/errno.h>
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/hardirq.h>
+#include <linux/delay.h>
+
+#include <asm/numachip/numachip_csr.h>
+#include <asm/smp.h>
+#include <asm/apic.h>
+#include <asm/ipi.h>
+#include <asm/apic_flat_64.h>
+
+static int numachip_system __read_mostly;
+
+static struct apic apic_numachip __read_mostly;
+
+static unsigned int get_apic_id(unsigned long x)
+{
+ unsigned long value;
+ unsigned int id;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
+
+ return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+ unsigned long x;
+
+ x = ((id & 0xffU) << 24);
+ return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+ return get_apic_id(apic_read(APIC_ID));
+}
+
+static int numachip_apic_id_registered(void)
+{
+ return physid_isset(read_xapic_id(), phys_cpu_present_map);
+}
+
+static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+static const struct cpumask *numachip_target_cpus(void)
+{
+ return cpu_online_mask;
+}
+
+static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask)
+{
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
+}
+
+static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+
+ int_gen.s._destination_apic_id = phys_apicid;
+ int_gen.s._vector = 0;
+ int_gen.s._msgtype = APIC_DM_INIT >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+ int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
+ int_gen.s._vector = start_rip >> 12;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+
+ atomic_set(&init_deasserted, 1);
+ return 0;
+}
+
+static void numachip_send_IPI_one(int cpu, int vector)
+{
+ union numachip_csr_g3_ext_irq_gen int_gen;
+ int apicid = per_cpu(x86_cpu_to_apicid, cpu);
+
+ int_gen.s._destination_apic_id = apicid;
+ int_gen.s._vector = vector;
+ int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
+ int_gen.s._index = 0;
+
+ write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
+}
+
+static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask)
+ numachip_send_IPI_one(cpu, vector);
+}
+
+static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_allbutself(int vector)
+{
+ unsigned int this_cpu = smp_processor_id();
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (cpu != this_cpu)
+ numachip_send_IPI_one(cpu, vector);
+ }
+}
+
+static void numachip_send_IPI_all(int vector)
+{
+ numachip_send_IPI_mask(cpu_online_mask, vector);
+}
+
+static void numachip_send_IPI_self(int vector)
+{
+ __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
+}
+
+static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = cpumask_first(cpumask);
+ if (likely((unsigned)cpu < nr_cpu_ids))
+ return per_cpu(x86_cpu_to_apicid, cpu);
+
+ return BAD_APICID;
+}
+
+static unsigned int
+numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask) {
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ }
+ return per_cpu(x86_cpu_to_apicid, cpu);
+}
+
+static int __init numachip_probe(void)
+{
+ return apic == &apic_numachip;
+}
+
+static void __init map_csrs(void)
+{
+ printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
+ NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
+
+ printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
+ NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
+ init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
+}
+
+static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ c->phys_proc_id = node;
+ per_cpu(cpu_llc_id, smp_processor_id()) = node;
+}
+
+static int __init numachip_system_init(void)
+{
+ unsigned int val;
+
+ if (!numachip_system)
+ return 0;
+
+ x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
+
+ map_csrs();
+
+ val = read_lcsr(CSR_G0_NODE_IDS);
+ printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
+
+ return 0;
+}
+early_initcall(numachip_system_init);
+
+static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "NUMASC", 6)) {
+ numachip_system = 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct apic apic_numachip __refconst = {
+
+ .name = "NumaConnect system",
+ .probe = numachip_probe,
+ .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
+ .apic_id_registered = numachip_apic_id_registered,
+
+ .irq_delivery_mode = dest_Fixed,
+ .irq_dest_mode = 0, /* physical */
+
+ .target_cpus = numachip_target_cpus,
+ .disable_esr = 0,
+ .dest_logical = 0,
+ .check_apicid_used = NULL,
+ .check_apicid_present = NULL,
+
+ .vector_allocation_domain = numachip_vector_allocation_domain,
+ .init_apic_ldr = flat_init_apic_ldr,
+
+ .ioapic_phys_id_map = NULL,
+ .setup_apic_routing = NULL,
+ .multi_timer_check = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = NULL,
+ .setup_portio_remap = NULL,
+ .check_phys_apicid_present = default_check_phys_apicid_present,
+ .enable_apic_mode = NULL,
+ .phys_pkg_id = numachip_phys_pkg_id,
+ .mps_oem_check = NULL,
+
+ .get_apic_id = get_apic_id,
+ .set_apic_id = set_apic_id,
+ .apic_id_mask = 0xffU << 24,
+
+ .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and,
+
+ .send_IPI_mask = numachip_send_IPI_mask,
+ .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = numachip_send_IPI_allbutself,
+ .send_IPI_all = numachip_send_IPI_all,
+ .send_IPI_self = numachip_send_IPI_self,
+
+ .wakeup_secondary_cpu = numachip_wakeup_secondary,
+ .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
+ .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
+ .wait_for_init_deassert = NULL,
+ .smp_callin_clear_local_apic = NULL,
+ .inquire_remote_apic = NULL, /* REMRD not supported */
+
+ .read = native_apic_mem_read,
+ .write = native_apic_mem_write,
+ .icr_read = native_apic_icr_read,
+ .icr_write = native_apic_icr_write,
+ .wait_icr_idle = native_apic_wait_icr_idle,
+ .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
+};
+apic_driver(apic_numachip);
+
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3c31fa98af6d..fb072754bc1d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -193,10 +193,8 @@ int __init arch_early_irq_init(void)
struct irq_cfg *cfg;
int count, node, i;
- if (!legacy_pic->nr_legacy_irqs) {
- nr_irqs_gsi = 0;
+ if (!legacy_pic->nr_legacy_irqs)
io_apic_irqs = ~0UL;
- }
for (i = 0; i < nr_ioapics; i++) {
ioapics[i].saved_registers =
@@ -1696,6 +1694,7 @@ __apicdebuginit(void) print_IO_APICs(void)
int ioapic_idx;
struct irq_cfg *cfg;
unsigned int irq;
+ struct irq_chip *chip;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
@@ -1716,6 +1715,10 @@ __apicdebuginit(void) print_IO_APICs(void)
for_each_active_irq(irq) {
struct irq_pin_list *entry;
+ chip = irq_get_chip(irq);
+ if (chip != &ioapic_chip)
+ continue;
+
cfg = irq_get_chip_data(irq);
if (!cfg)
continue;
@@ -2418,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
unsigned vector, me;
ack_APIC_irq();
- exit_idle();
irq_enter();
+ exit_idle();
me = smp_processor_id();
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2945,6 +2948,10 @@ static inline void __init check_timer(void)
}
local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+ if (x2apic_preenabled)
+ apic_printk(APIC_QUIET, KERN_INFO
+ "Perhaps problem with the pre-enabled x2apic mode\n"
+ "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 62ae3001ae02..79b05b88aa19 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -93,6 +93,8 @@ static int __init early_get_pnodeid(void)
if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+ if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
+ uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
uv_hub_info->hub_revision = uv_min_hub_revision_id;
pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -767,7 +769,12 @@ void __init uv_system_init(void)
for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
uv_possible_blades +=
hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
- printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
+
+ /* uv_num_possible_blades() is really the hub count */
+ printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+ is_uv1_hub() ? uv_num_possible_blades() :
+ (uv_num_possible_blades() + 1) / 2,
+ uv_num_possible_blades());
bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
uv_blade_info = kzalloc(bytes, GFP_KERNEL);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index a46bd383953c..f76623cbe263 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -383,21 +383,21 @@ static int ignore_sys_suspend;
static int ignore_normal_resume;
static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
-static int debug __read_mostly;
-static int smp __read_mostly;
+static bool debug __read_mostly;
+static bool smp __read_mostly;
static int apm_disabled = -1;
#ifdef CONFIG_SMP
-static int power_off;
+static bool power_off;
#else
-static int power_off = 1;
+static bool power_off = 1;
#endif
-static int realmode_power_off;
+static bool realmode_power_off;
#ifdef CONFIG_APM_ALLOW_INTS
-static int allow_ints = 1;
+static bool allow_ints = 1;
#else
-static int allow_ints;
+static bool allow_ints;
#endif
-static int broken_psr;
+static bool broken_psr;
static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 4f13fafc5264..68de2dc962ec 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,4 +67,6 @@ void common(void) {
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
+ OFFSET(BP_pref_address, boot_params, hdr.pref_address);
+ OFFSET(BP_code32_start, boot_params, hdr.code32_start);
}
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index 452932d34730..5da1269e8ddc 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
void __init setup_bios_corruption_check(void)
{
- u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
+ phys_addr_t start, end;
+ u64 i;
if (memory_corruption_check == -1) {
memory_corruption_check =
@@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void)
corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
- while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
- u64 size;
- addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+ start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
+ PAGE_SIZE, corruption_check_size);
+ if (start >= end)
+ continue;
- if (addr == MEMBLOCK_ERROR)
- break;
-
- if (addr >= corruption_check_size)
- break;
-
- if ((addr + size) > corruption_check_size)
- size = corruption_check_size - addr;
-
- memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
- scan_areas[num_scan_areas].addr = addr;
- scan_areas[num_scan_areas].size = size;
- num_scan_areas++;
+ memblock_reserve(start, end - start);
+ scan_areas[num_scan_areas].addr = start;
+ scan_areas[num_scan_areas].size = end - start;
/* Assume we've already mapped this early memory */
- memset(__va(addr), 0, size);
+ memset(__va(start), 0, end - start);
- addr += size;
+ if (++num_scan_areas >= MAX_SCAN_AREAS)
+ break;
}
if (num_scan_areas)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c7e46cb35327..f4773f4aae35 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
valid_k7:
;
-#endif
}
static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
if (node == NUMA_NO_NODE)
node = per_cpu(cpu_llc_id, cpu);
+ /*
+ * If core numbers are inconsistent, it's likely a multi-fabric platform,
+ * so invoke platform-specific handler
+ */
+ if (c->phys_proc_id != node)
+ x86_cpuinit.fixup_cpu_id(c, node);
+
if (!node_online(node)) {
/*
* Two possibilities here:
@@ -442,8 +447,6 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
{
- u32 dummy;
-
early_init_amd_mc(c);
/*
@@ -473,12 +476,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
}
#endif
-
- rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
}
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
{
+ u32 dummy;
+
#ifdef CONFIG_SMP
unsigned long long value;
@@ -657,6 +660,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
}
}
+
+ rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e58d978e0758..159103c0b1f4 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_X86_32
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
- if (c->x86_model >= 6 && c->x86_model <= 9) {
+ if (c->x86_model >= 6 && c->x86_model <= 13) {
rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa003b13a831..d43cad74f166 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_early_init)
this_cpu->c_early_init(c);
-#ifdef CONFIG_SMP
c->cpu_index = 0;
-#endif
filter_cpuid_features(c, false);
setup_smep(c);
@@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
c->apicid = c->initial_apicid;
# endif
#endif
-
-#ifdef CONFIG_X86_HT
c->phys_proc_id = c->initial_apicid;
-#endif
}
setup_smep(c);
@@ -1026,6 +1021,8 @@ __setup("clearcpuid=", setup_disablecpuid);
#ifdef CONFIG_X86_64
struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
+struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
+ (unsigned long) nmi_idt_table };
DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE);
@@ -1090,6 +1087,26 @@ unsigned long kernel_eflags;
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
+static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
+DEFINE_PER_CPU(int, debug_stack_usage);
+
+int is_debug_stack(unsigned long addr)
+{
+ return __get_cpu_var(debug_stack_usage) ||
+ (addr <= __get_cpu_var(debug_stack_addr) &&
+ addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
+}
+
+void debug_stack_set_zero(void)
+{
+ load_idt((const struct desc_ptr *)&nmi_idt_descr);
+}
+
+void debug_stack_reset(void)
+{
+ load_idt((const struct desc_ptr *)&idt_descr);
+}
+
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
@@ -1141,6 +1158,15 @@ static void dbg_restore_debug_regs(void)
#endif /* ! CONFIG_KGDB */
/*
+ * Prints an error where the NUMA and configured core-number mismatch and the
+ * platform didn't override this to fix it up
+ */
+void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node)
+{
+ pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id);
+}
+
+/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
@@ -1208,6 +1234,8 @@ void __cpuinit cpu_init(void)
estacks += exception_stack_sizes[v];
oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
+ if (v == DEBUG_STACK-1)
+ per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
}
}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 1b22dcc51af4..8bacc7826fb3 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,5 +1,4 @@
#ifndef ARCH_X86_CPU_H
-
#define ARCH_X86_CPU_H
struct cpu_model_info {
@@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
-extern void get_cpu_cap(struct cpuinfo_x86 *c);
-
-#endif
+#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 523131213f08..3e6ff6cbf42a 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void)
static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_SMP
/* calling is from identify_secondary_cpu() ? */
if (!c->cpu_index)
return;
@@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
"with B stepping processors.\n");
}
-#endif
}
static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index a3b0811693c9..6b45e5e7a901 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
#include <linux/kobject.h>
#include <linux/sysfs.h>
-
-extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
+#include <linux/cpu.h>
/* pointer to kobject for cpuX/cache */
static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -1073,9 +1072,9 @@ err_out:
static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
/* Add/Remove cache interface for CPU device */
-static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
+static int __cpuinit cache_add_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i, j;
struct _index_kobject *this_object;
struct _cpuid4_info *this_leaf;
@@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
&ktype_percpu_entry,
- &sys_dev->kobj, "%s", "cache");
+ &dev->kobj, "%s", "cache");
if (retval < 0) {
cpuid4_cache_sysfs_exit(cpu);
return retval;
@@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
return 0;
}
-static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
+static void __cpuinit cache_remove_dev(struct device *dev)
{
- unsigned int cpu = sys_dev->id;
+ unsigned int cpu = dev->id;
unsigned long i;
if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- cache_add_dev(sys_dev);
+ cache_add_dev(dev);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- cache_remove_dev(sys_dev);
+ cache_remove_dev(dev);
break;
}
return NOTIFY_OK;
@@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void)
for_each_online_cpu(i) {
int err;
- struct sys_device *sys_dev = get_cpu_sysdev(i);
+ struct device *dev = get_cpu_device(i);
- err = cache_add_dev(sys_dev);
+ err = cache_add_dev(dev);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 6199232161cf..fc4beb393577 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -17,6 +17,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/fs.h>
+#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>
@@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
return NMI_HANDLED;
}
+static void mce_irq_ipi(void *info)
+{
+ int cpu = smp_processor_id();
+ struct mce *m = &__get_cpu_var(injectm);
+
+ if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
+ m->inject_flags & MCJ_EXCEPTION) {
+ cpumask_clear_cpu(cpu, mce_inject_cpumask);
+ raise_exception(m, NULL);
+ }
+}
+
/* Inject mce on current CPU */
static int raise_local(void)
{
@@ -139,9 +152,10 @@ static void raise_mce(struct mce *m)
return;
#ifdef CONFIG_X86_LOCAL_APIC
- if (m->inject_flags & MCJ_NMI_BROADCAST) {
+ if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
unsigned long start;
int cpu;
+
get_online_cpus();
cpumask_copy(mce_inject_cpumask, cpu_online_mask);
cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -151,13 +165,25 @@ static void raise_mce(struct mce *m)
MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
cpumask_clear_cpu(cpu, mce_inject_cpumask);
}
- if (!cpumask_empty(mce_inject_cpumask))
- apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
+ if (!cpumask_empty(mce_inject_cpumask)) {
+ if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
+ /*
+ * don't wait because mce_irq_ipi is necessary
+ * to be sync with following raise_local
+ */
+ preempt_disable();
+ smp_call_function_many(mce_inject_cpumask,
+ mce_irq_ipi, NULL, 0);
+ preempt_enable();
+ } else if (m->inject_flags & MCJ_NMI_BROADCAST)
+ apic->send_IPI_mask(mce_inject_cpumask,
+ NMI_VECTOR);
+ }
start = jiffies;
while (!cpumask_empty(mce_inject_cpumask)) {
if (!time_before(jiffies, start + 2*HZ)) {
printk(KERN_ERR
- "Timeout waiting for mce inject NMI %lx\n",
+ "Timeout waiting for mce inject %lx\n",
*cpumask_bits(mce_inject_cpumask));
break;
}
@@ -208,7 +234,7 @@ static int inject_init(void)
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
printk(KERN_INFO "Machine check injector initialized\n");
- mce_chrdev_ops.write = mce_write;
+ register_mce_write_callback(mce_write);
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0,
"mce_notify");
return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index fefcc69ee8b5..ed44c8a65858 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <asm/mce.h>
enum severity_level {
@@ -17,7 +17,7 @@ enum severity_level {
struct mce_bank {
u64 ctl; /* subevents to enable */
unsigned char init; /* initialise bank? */
- struct sysdev_attribute attr; /* sysdev attribute */
+ struct device_attribute attr; /* device attribute */
char attrname[ATTR_LEN]; /* attribute name */
};
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 362056aefeb4..5a11ae2e9e91 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -19,7 +19,7 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/string.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/syscore_ops.h>
#include <linux/delay.h>
#include <linux/ctype.h>
@@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/*
- * CPU/chipset specific EDAC code can register a notifier call here to print
- * MCE errors in a human-readable form.
- */
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
-EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
-
/* MCA banks polled by the period polling timer for corrected events */
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
static DEFINE_PER_CPU(struct work_struct, mce_work);
+/*
+ * CPU/chipset specific EDAC code can register a notifier call here to print
+ * MCE errors in a human-readable form.
+ */
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
{
@@ -119,9 +118,7 @@ void mce_setup(struct mce *m)
m->time = get_seconds();
m->cpuvendor = boot_cpu_data.x86_vendor;
m->cpuid = cpuid_eax(1);
-#ifdef CONFIG_SMP
m->socketid = cpu_data(m->extcpu).phys_proc_id;
-#endif
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
}
@@ -190,6 +187,57 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}
+static void drain_mcelog_buffer(void)
+{
+ unsigned int next, i, prev = 0;
+
+ next = rcu_dereference_check_mce(mcelog.next);
+
+ do {
+ struct mce *m;
+
+ /* drain what was logged during boot */
+ for (i = prev; i < next; i++) {
+ unsigned long start = jiffies;
+ unsigned retries = 1;
+
+ m = &mcelog.entry[i];
+
+ while (!m->finished) {
+ if (time_after_eq(jiffies, start + 2*retries))
+ retries++;
+
+ cpu_relax();
+
+ if (!m->finished && retries >= 4) {
+ pr_err("MCE: skipping error being logged currently!\n");
+ break;
+ }
+ }
+ smp_rmb();
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+ }
+
+ memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
+ prev = next;
+ next = cmpxchg(&mcelog.next, prev, 0);
+ } while (next != prev);
+}
+
+
+void mce_register_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
+ drain_mcelog_buffer();
+}
+EXPORT_SYMBOL_GPL(mce_register_decode_chain);
+
+void mce_unregister_decode_chain(struct notifier_block *nb)
+{
+ atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
+}
+EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
+
static void print_mce(struct mce *m)
{
int ret = 0;
@@ -1634,16 +1682,35 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
}
}
-/* Modified in mce-inject.c, so not static or const */
-struct file_operations mce_chrdev_ops = {
+static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off);
+
+void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
+ const char __user *ubuf,
+ size_t usize, loff_t *off))
+{
+ mce_write = fn;
+}
+EXPORT_SYMBOL_GPL(register_mce_write_callback);
+
+ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+ size_t usize, loff_t *off)
+{
+ if (mce_write)
+ return mce_write(filp, ubuf, usize, off);
+ else
+ return -EINVAL;
+}
+
+static const struct file_operations mce_chrdev_ops = {
.open = mce_chrdev_open,
.release = mce_chrdev_release,
.read = mce_chrdev_read,
+ .write = mce_chrdev_write,
.poll = mce_chrdev_poll,
.unlocked_ioctl = mce_chrdev_ioctl,
.llseek = no_llseek,
};
-EXPORT_SYMBOL_GPL(mce_chrdev_ops);
static struct miscdevice mce_chrdev_device = {
MISC_MCELOG_MINOR,
@@ -1751,7 +1818,7 @@ static struct syscore_ops mce_syscore_ops = {
};
/*
- * mce_sysdev: Sysfs support
+ * mce_device: Sysfs support
*/
static void mce_cpu_restart(void *data)
@@ -1787,27 +1854,28 @@ static void mce_enable_ce(void *all)
__mcheck_cpu_init_timer();
}
-static struct sysdev_class mce_sysdev_class = {
+static struct bus_type mce_subsys = {
.name = "machinecheck",
+ .dev_name = "machinecheck",
};
-DEFINE_PER_CPU(struct sys_device, mce_sysdev);
+struct device *mce_device[CONFIG_NR_CPUS];
__cpuinitdata
void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
+static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
}
-static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t show_bank(struct device *s, struct device_attribute *attr,
char *buf)
{
return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
}
-static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_bank(struct device *s, struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1822,14 +1890,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
}
static ssize_t
-show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
+show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
strcpy(buf, mce_helper);
strcat(buf, "\n");
return strlen(mce_helper) + 1;
}
-static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
const char *buf, size_t siz)
{
char *p;
@@ -1844,8 +1912,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
return strlen(mce_helper) + !!p;
}
-static ssize_t set_ignore_ce(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_ignore_ce(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1868,8 +1936,8 @@ static ssize_t set_ignore_ce(struct sys_device *s,
return size;
}
-static ssize_t set_cmci_disabled(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t set_cmci_disabled(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
u64 new;
@@ -1891,108 +1959,117 @@ static ssize_t set_cmci_disabled(struct sys_device *s,
return size;
}
-static ssize_t store_int_with_restart(struct sys_device *s,
- struct sysdev_attribute *attr,
+static ssize_t store_int_with_restart(struct device *s,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
- ssize_t ret = sysdev_store_int(s, attr, buf, size);
+ ssize_t ret = device_store_int(s, attr, buf, size);
mce_restart();
return ret;
}
-static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
-static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
-static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
+static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
+static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
+static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
+static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
-static struct sysdev_ext_attribute attr_check_interval = {
- _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
- store_int_with_restart),
+static struct dev_ext_attribute dev_attr_check_interval = {
+ __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
&check_interval
};
-static struct sysdev_ext_attribute attr_ignore_ce = {
- _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
+static struct dev_ext_attribute dev_attr_ignore_ce = {
+ __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
&mce_ignore_ce
};
-static struct sysdev_ext_attribute attr_cmci_disabled = {
- _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
+static struct dev_ext_attribute dev_attr_cmci_disabled = {
+ __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
&mce_cmci_disabled
};
-static struct sysdev_attribute *mce_sysdev_attrs[] = {
- &attr_tolerant.attr,
- &attr_check_interval.attr,
- &attr_trigger,
- &attr_monarch_timeout.attr,
- &attr_dont_log_ce.attr,
- &attr_ignore_ce.attr,
- &attr_cmci_disabled.attr,
+static struct device_attribute *mce_device_attrs[] = {
+ &dev_attr_tolerant.attr,
+ &dev_attr_check_interval.attr,
+ &dev_attr_trigger,
+ &dev_attr_monarch_timeout.attr,
+ &dev_attr_dont_log_ce.attr,
+ &dev_attr_ignore_ce.attr,
+ &dev_attr_cmci_disabled.attr,
NULL
};
-static cpumask_var_t mce_sysdev_initialized;
+static cpumask_var_t mce_device_initialized;
-/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_sysdev_create(unsigned int cpu)
+static void mce_device_release(struct device *dev)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ kfree(dev);
+}
+
+/* Per cpu device init. All of the cpus still share the same ctrl bank: */
+static __cpuinit int mce_device_create(unsigned int cpu)
+{
+ struct device *dev;
int err;
int i, j;
if (!mce_available(&boot_cpu_data))
return -EIO;
- memset(&sysdev->kobj, 0, sizeof(struct kobject));
- sysdev->id = cpu;
- sysdev->cls = &mce_sysdev_class;
+ dev = kzalloc(sizeof *dev, GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ dev->id = cpu;
+ dev->bus = &mce_subsys;
+ dev->release = &mce_device_release;
- err = sysdev_register(sysdev);
+ err = device_register(dev);
if (err)
return err;
- for (i = 0; mce_sysdev_attrs[i]; i++) {
- err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++) {
+ err = device_create_file(dev, mce_device_attrs[i]);
if (err)
goto error;
}
for (j = 0; j < banks; j++) {
- err = sysdev_create_file(sysdev, &mce_banks[j].attr);
+ err = device_create_file(dev, &mce_banks[j].attr);
if (err)
goto error2;
}
- cpumask_set_cpu(cpu, mce_sysdev_initialized);
+ cpumask_set_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = dev;
return 0;
error2:
while (--j >= 0)
- sysdev_remove_file(sysdev, &mce_banks[j].attr);
+ device_remove_file(dev, &mce_banks[j].attr);
error:
while (--i >= 0)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ device_remove_file(dev, mce_device_attrs[i]);
- sysdev_unregister(sysdev);
+ device_unregister(dev);
return err;
}
-static __cpuinit void mce_sysdev_remove(unsigned int cpu)
+static __cpuinit void mce_device_remove(unsigned int cpu)
{
- struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
+ struct device *dev = mce_device[cpu];
int i;
- if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
+ if (!cpumask_test_cpu(cpu, mce_device_initialized))
return;
- for (i = 0; mce_sysdev_attrs[i]; i++)
- sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
+ for (i = 0; mce_device_attrs[i]; i++)
+ device_remove_file(dev, mce_device_attrs[i]);
for (i = 0; i < banks; i++)
- sysdev_remove_file(sysdev, &mce_banks[i].attr);
+ device_remove_file(dev, &mce_banks[i].attr);
- sysdev_unregister(sysdev);
- cpumask_clear_cpu(cpu, mce_sysdev_initialized);
+ device_unregister(dev);
+ cpumask_clear_cpu(cpu, mce_device_initialized);
+ mce_device[cpu] = NULL;
}
/* Make sure there are no machine checks on offlined CPUs. */
@@ -2042,7 +2119,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
- mce_sysdev_create(cpu);
+ mce_device_create(cpu);
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
break;
@@ -2050,7 +2127,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD_FROZEN:
if (threshold_cpu_callback)
threshold_cpu_callback(action, cpu);
- mce_sysdev_remove(cpu);
+ mce_device_remove(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
@@ -2084,7 +2161,7 @@ static __init void mce_init_banks(void)
for (i = 0; i < banks; i++) {
struct mce_bank *b = &mce_banks[i];
- struct sysdev_attribute *a = &b->attr;
+ struct device_attribute *a = &b->attr;
sysfs_attr_init(&a->attr);
a->attr.name = b->attrname;
@@ -2104,16 +2181,16 @@ static __init int mcheck_init_device(void)
if (!mce_available(&boot_cpu_data))
return -EIO;
- zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
+ zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);
mce_init_banks();
- err = sysdev_class_register(&mce_sysdev_class);
+ err = subsys_system_register(&mce_subsys, NULL);
if (err)
return err;
for_each_online_cpu(i) {
- err = mce_sysdev_create(i);
+ err = mce_device_create(i);
if (err)
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f5474218cffe..786e76a86322 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -17,7 +17,6 @@
#include <linux/notifier.h>
#include <linux/kobject.h>
#include <linux/percpu.h>
-#include <linux/sysdev.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sysfs.h>
@@ -64,11 +63,9 @@ struct threshold_bank {
};
static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
-#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
0, 0, 0, 0, 1
};
-#endif
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
@@ -202,10 +199,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!block)
per_cpu(bank_map, cpu) |= (1 << bank);
-#ifdef CONFIG_SMP
if (shared_bank[bank] && c->cpu_core_id)
break;
-#endif
+
offset = setup_APIC_mce(offset,
(high & MASK_LVTOFF_HI) >> 20);
@@ -527,11 +523,11 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
+ struct device *dev = mce_device[cpu];
char name[32];
sprintf(name, "threshold_bank%i", bank);
-#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
i = cpumask_first(cpu_llc_shared_mask(cpu));
@@ -548,8 +544,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (!b)
goto out;
- err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
- b->kobj, name);
+ err = sysfs_create_link(&dev->kobj, b->kobj, name);
if (err)
goto out;
@@ -558,7 +553,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
-#endif
b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
if (!b) {
@@ -571,7 +565,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out;
}
- b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
+ b->kobj = kobject_create_and_add(name, &dev->kobj);
if (!b->kobj)
goto out_free;
@@ -591,8 +585,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
if (i == cpu)
continue;
- err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
- b->kobj, name);
+ dev = mce_device[i];
+ if (dev)
+ err = sysfs_create_link(&dev->kobj,b->kobj, name);
if (err)
goto out;
@@ -655,6 +650,7 @@ static void deallocate_threshold_block(unsigned int cpu,
static void threshold_remove_bank(unsigned int cpu, int bank)
{
struct threshold_bank *b;
+ struct device *dev;
char name[32];
int i = 0;
@@ -669,7 +665,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
#ifdef CONFIG_SMP
/* sibling symlink */
if (shared_bank[bank] && b->blocks->cpu != cpu) {
- sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
+ sysfs_remove_link(&mce_device[cpu]->kobj, name);
per_cpu(threshold_banks, cpu)[bank] = NULL;
return;
@@ -681,7 +677,9 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
if (i == cpu)
continue;
- sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
+ dev = mce_device[i];
+ if (dev)
+ sysfs_remove_link(&dev->kobj, name);
per_cpu(threshold_banks, i)[bank] = NULL;
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..67bb17a37a0a 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -19,7 +19,6 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
@@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
-#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, \
- therm_throt_sysdev_show_##_name, \
+#define define_therm_throt_device_one_ro(_name) \
+ static DEVICE_ATTR(_name, 0444, \
+ therm_throt_device_show_##_name, \
NULL) \
-#define define_therm_throt_sysdev_show_func(event, name) \
+#define define_therm_throt_device_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##event##_##name( \
- struct sys_device *dev, \
- struct sysdev_attribute *attr, \
+static ssize_t therm_throt_device_show_##event##_##name( \
+ struct device *dev, \
+ struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
@@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \
return ret; \
}
-define_therm_throt_sysdev_show_func(core_throttle, count);
-define_therm_throt_sysdev_one_ro(core_throttle_count);
+define_therm_throt_device_show_func(core_throttle, count);
+define_therm_throt_device_one_ro(core_throttle_count);
-define_therm_throt_sysdev_show_func(core_power_limit, count);
-define_therm_throt_sysdev_one_ro(core_power_limit_count);
+define_therm_throt_device_show_func(core_power_limit, count);
+define_therm_throt_device_one_ro(core_power_limit_count);
-define_therm_throt_sysdev_show_func(package_throttle, count);
-define_therm_throt_sysdev_one_ro(package_throttle_count);
+define_therm_throt_device_show_func(package_throttle, count);
+define_therm_throt_device_one_ro(package_throttle_count);
-define_therm_throt_sysdev_show_func(package_power_limit, count);
-define_therm_throt_sysdev_one_ro(package_power_limit_count);
+define_therm_throt_device_show_func(package_power_limit, count);
+define_therm_throt_device_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_core_throttle_count.attr,
+ &dev_attr_core_throttle_count.attr,
NULL
};
@@ -223,36 +222,36 @@ static int thresh_event_valid(int event)
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
-static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
+static __cpuinit int thermal_throttle_add_dev(struct device *dev,
unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
- err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_core_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PTS)) {
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_throttle_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (cpu_has(c, X86_FEATURE_PLN))
- err = sysfs_add_file_to_group(&sys_dev->kobj,
- &attr_package_power_limit_count.attr,
+ err = sysfs_add_file_to_group(&dev->kobj,
+ &dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
}
return err;
}
-static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
+static __cpuinit void thermal_throttle_remove_dev(struct device *dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
+ sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
int err = 0;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
mutex_lock(&therm_cpu_lock);
- err = thermal_throttle_add_dev(sys_dev, cpu);
+ err = thermal_throttle_add_dev(dev, cpu);
mutex_unlock(&therm_cpu_lock);
WARN_ON(err);
break;
@@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
mutex_lock(&therm_cpu_lock);
- thermal_throttle_remove_dev(sys_dev);
+ thermal_throttle_remove_dev(dev);
mutex_unlock(&therm_cpu_lock);
break;
}
@@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
#endif
/* connect live CPUs to sysfs */
for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
+ err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
WARN_ON(err);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -323,17 +322,6 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
-/*
- * Set up the most two significant bit to notify mce log that this thermal
- * event type.
- * This is a temp solution. May be changed in the future with mce log
- * infrasture.
- */
-#define CORE_THROTTLED (0)
-#define CORE_POWER_LIMIT ((__u64)1 << 62)
-#define PACKAGE_THROTTLED ((__u64)2 << 62)
-#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
-
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
@@ -363,27 +351,23 @@ static void intel_thermal_interrupt(void)
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+ mce_log_therm_throt_event(msr_val);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- CORE_LEVEL) != 0)
- mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+ CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
- if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN))
- if (therm_throt_process(msr_val &
+ therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
- PACKAGE_LEVEL) != 0)
- mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
- | msr_val);
+ PACKAGE_LEVEL);
}
}
@@ -397,8 +381,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
{
- exit_idle();
irq_enter();
+ exit_idle();
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
irq_exit();
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
asmlinkage void smp_threshold_interrupt(void)
{
- exit_idle();
irq_enter();
+ exit_idle();
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
irq_exit();
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index a71efcdbb092..97b26356e9ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
if (tmp != mask_lo) {
printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
+ add_taint(TAINT_FIRMWARE_WORKAROUND);
mask_lo = tmp;
}
}
@@ -693,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
+ wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 640891014b2a..5adce1040b11 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event)
return -EOPNOTSUPP;
}
- /*
- * Do not allow config1 (extended registers) to propagate,
- * there's no sane user-space generalization yet:
- */
if (attr->type == PERF_TYPE_RAW)
- return 0;
+ return x86_pmu_extra_regs(event->attr.config, event);
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, event);
@@ -488,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event)
return event->pmu == &pmu;
}
+/*
+ * Event scheduler state:
+ *
+ * Assign events iterating over all events and counters, beginning
+ * with events with least weights first. Keep the current iterator
+ * state in struct sched_state.
+ */
+struct sched_state {
+ int weight;
+ int event; /* event index */
+ int counter; /* counter index */
+ int unassigned; /* number of events to be assigned left */
+ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+};
+
+/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
+#define SCHED_STATES_MAX 2
+
+struct perf_sched {
+ int max_weight;
+ int max_events;
+ struct event_constraint **constraints;
+ struct sched_state state;
+ int saved_states;
+ struct sched_state saved[SCHED_STATES_MAX];
+};
+
+/*
+ * Initialize interator that runs through all events and counters.
+ */
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
+ int num, int wmin, int wmax)
+{
+ int idx;
+
+ memset(sched, 0, sizeof(*sched));
+ sched->max_events = num;
+ sched->max_weight = wmax;
+ sched->constraints = c;
+
+ for (idx = 0; idx < num; idx++) {
+ if (c[idx]->weight == wmin)
+ break;
+ }
+
+ sched->state.event = idx; /* start with min weight */
+ sched->state.weight = wmin;
+ sched->state.unassigned = num;
+}
+
+static void perf_sched_save_state(struct perf_sched *sched)
+{
+ if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
+ return;
+
+ sched->saved[sched->saved_states] = sched->state;
+ sched->saved_states++;
+}
+
+static bool perf_sched_restore_state(struct perf_sched *sched)
+{
+ if (!sched->saved_states)
+ return false;
+
+ sched->saved_states--;
+ sched->state = sched->saved[sched->saved_states];
+
+ /* continue with next counter: */
+ clear_bit(sched->state.counter++, sched->state.used);
+
+ return true;
+}
+
+/*
+ * Select a counter for the current event to schedule. Return true on
+ * success.
+ */
+static bool __perf_sched_find_counter(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+ int idx;
+
+ if (!sched->state.unassigned)
+ return false;
+
+ if (sched->state.event >= sched->max_events)
+ return false;
+
+ c = sched->constraints[sched->state.event];
+
+ /* Prefer fixed purpose counters */
+ if (x86_pmu.num_counters_fixed) {
+ idx = X86_PMC_IDX_FIXED;
+ for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) {
+ if (!__test_and_set_bit(idx, sched->state.used))
+ goto done;
+ }
+ }
+ /* Grab the first unused counter starting with idx */
+ idx = sched->state.counter;
+ for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) {
+ if (!__test_and_set_bit(idx, sched->state.used))
+ goto done;
+ }
+
+ return false;
+
+done:
+ sched->state.counter = idx;
+
+ if (c->overlap)
+ perf_sched_save_state(sched);
+
+ return true;
+}
+
+static bool perf_sched_find_counter(struct perf_sched *sched)
+{
+ while (!__perf_sched_find_counter(sched)) {
+ if (!perf_sched_restore_state(sched))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Go through all unassigned events and find the next one to schedule.
+ * Take events with the least weight first. Return true on success.
+ */
+static bool perf_sched_next_event(struct perf_sched *sched)
+{
+ struct event_constraint *c;
+
+ if (!sched->state.unassigned || !--sched->state.unassigned)
+ return false;
+
+ do {
+ /* next event */
+ sched->state.event++;
+ if (sched->state.event >= sched->max_events) {
+ /* next weight */
+ sched->state.event = 0;
+ sched->state.weight++;
+ if (sched->state.weight > sched->max_weight)
+ return false;
+ }
+ c = sched->constraints[sched->state.event];
+ } while (c->weight != sched->state.weight);
+
+ sched->state.counter = 0; /* start with first counter */
+
+ return true;
+}
+
+/*
+ * Assign a counter for each event.
+ */
+static int perf_assign_events(struct event_constraint **constraints, int n,
+ int wmin, int wmax, int *assign)
+{
+ struct perf_sched sched;
+
+ perf_sched_init(&sched, constraints, n, wmin, wmax);
+
+ do {
+ if (!perf_sched_find_counter(&sched))
+ break; /* failed */
+ if (assign)
+ assign[sched.state.event] = sched.state.counter;
+ } while (perf_sched_next_event(&sched));
+
+ return sched.state.unassigned;
+}
+
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- int i, j, w, wmax, num = 0;
+ int i, wmin, wmax, num = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
- for (i = 0; i < n; i++) {
+ for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
constraints[i] = c;
+ wmin = min(wmin, c->weight);
+ wmax = max(wmax, c->weight);
}
/*
@@ -525,59 +698,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (assign)
assign[i] = hwc->idx;
}
- if (i == n)
- goto done;
-
- /*
- * begin slow path
- */
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
-
- /*
- * weight = number of possible counters
- *
- * 1 = most constrained, only works on one counter
- * wmax = least constrained, works on any counter
- *
- * assign events to counters starting with most
- * constrained events.
- */
- wmax = x86_pmu.num_counters;
-
- /*
- * when fixed event counters are present,
- * wmax is incremented by 1 to account
- * for one more choice
- */
- if (x86_pmu.num_counters_fixed)
- wmax++;
-
- for (w = 1, num = n; num && w <= wmax; w++) {
- /* for each event */
- for (i = 0; num && i < n; i++) {
- c = constraints[i];
- hwc = &cpuc->event_list[i]->hw;
-
- if (c->weight != w)
- continue;
- for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!test_bit(j, used_mask))
- break;
- }
-
- if (j == X86_PMC_IDX_MAX)
- break;
+ /* slow path */
+ if (i != n)
+ num = perf_assign_events(constraints, n, wmin, wmax, assign);
- __set_bit(j, used_mask);
-
- if (assign)
- assign[i] = j;
- num--;
- }
- }
-done:
/*
* scheduling failed or is just a simulation,
* free resources if necessary
@@ -588,7 +713,7 @@ done:
x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
}
}
- return num ? -ENOSPC : 0;
+ return num ? -EINVAL : 0;
}
/*
@@ -607,7 +732,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
if (is_x86_event(leader)) {
if (n >= max_count)
- return -ENOSPC;
+ return -EINVAL;
cpuc->event_list[n] = leader;
n++;
}
@@ -620,7 +745,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
continue;
if (n >= max_count)
- return -ENOSPC;
+ return -EINVAL;
cpuc->event_list[n] = event;
n++;
@@ -1123,6 +1248,7 @@ static void __init pmu_check_apic(void)
static int __init init_hw_perf_events(void)
{
+ struct x86_pmu_quirk *quirk;
struct event_constraint *c;
int err;
@@ -1151,8 +1277,8 @@ static int __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
- if (x86_pmu.quirks)
- x86_pmu.quirks();
+ for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next)
+ quirk->func();
if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
@@ -1175,12 +1301,18 @@ static int __init init_hw_perf_events(void)
unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
- 0, x86_pmu.num_counters);
+ 0, x86_pmu.num_counters, 0);
if (x86_pmu.event_constraints) {
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != X86_RAW_EVENT_MASK)
+ if (c->cmask != X86_RAW_EVENT_MASK
+ || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) {
continue;
+ }
c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
c->weight += x86_pmu.num_counters;
@@ -1316,7 +1448,7 @@ static int validate_event(struct perf_event *event)
c = x86_pmu.get_event_constraints(fake_cpuc, event);
if (!c || !c->weight)
- ret = -ENOSPC;
+ ret = -EINVAL;
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(fake_cpuc, event);
@@ -1341,7 +1473,7 @@ static int validate_group(struct perf_event *event)
{
struct perf_event *leader = event->group_leader;
struct cpu_hw_events *fake_cpuc;
- int ret = -ENOSPC, n;
+ int ret = -EINVAL, n;
fake_cpuc = allocate_fake_cpuc();
if (IS_ERR(fake_cpuc))
@@ -1570,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
return misc;
}
+
+void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+{
+ cap->version = x86_pmu.version;
+ cap->num_counters_gp = x86_pmu.num_counters;
+ cap->num_counters_fixed = x86_pmu.num_counters_fixed;
+ cap->bit_width_gp = x86_pmu.cntval_bits;
+ cap->bit_width_fixed = x86_pmu.cntval_bits;
+ cap->events_mask = (unsigned int)x86_pmu.events_maskl;
+ cap->events_mask_len = x86_pmu.events_mask_len;
+}
+EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index b9698d40ac4b..8944062f46e2 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -45,6 +45,7 @@ struct event_constraint {
u64 code;
u64 cmask;
int weight;
+ int overlap;
};
struct amd_nb {
@@ -151,15 +152,40 @@ struct cpu_hw_events {
void *kfree_on_online;
};
-#define __EVENT_CONSTRAINT(c, n, m, w) {\
+#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
{ .idxmsk64 = (n) }, \
.code = (c), \
.cmask = (m), \
.weight = (w), \
+ .overlap = (o), \
}
#define EVENT_CONSTRAINT(c, n, m) \
- __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
+
+/*
+ * The overlap flag marks event constraints with overlapping counter
+ * masks. This is the case if the counter mask of such an event is not
+ * a subset of any other counter mask of a constraint with an equal or
+ * higher weight, e.g.:
+ *
+ * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
+ * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
+ * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
+ *
+ * The event scheduler may not select the correct counter in the first
+ * cycle because it needs to know which subsequent events will be
+ * scheduled. It may fail to schedule the events then. So we set the
+ * overlap flag for such constraints to give the scheduler a hint which
+ * events to select for counter rescheduling.
+ *
+ * Care must be taken as the rescheduling algorithm is O(n!) which
+ * will increase scheduling cycles for an over-commited system
+ * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
+ * and its counter masks must be kept at a minimum.
+ */
+#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
+ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
/*
* Constraint on the Event code.
@@ -235,6 +261,11 @@ union perf_capabilities {
u64 capabilities;
};
+struct x86_pmu_quirk {
+ struct x86_pmu_quirk *next;
+ void (*func)(void);
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -259,6 +290,11 @@ struct x86_pmu {
int num_counters_fixed;
int cntval_bits;
u64 cntval_mask;
+ union {
+ unsigned long events_maskl;
+ unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
+ };
+ int events_mask_len;
int apic;
u64 max_period;
struct event_constraint *
@@ -268,7 +304,7 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
- void (*quirks)(void);
+ struct x86_pmu_quirk *quirks;
int perfctr_second_write;
int (*cpu_prepare)(int cpu);
@@ -309,6 +345,15 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+#define x86_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = x86_pmu.quirks; \
+ x86_pmu.quirks = &__quirk; \
+} while (0)
+
#define ERF_NO_HT_SHARING 1
#define ERF_HAS_RSP_1 2
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index aeefd45697a2..0397b23be8e9 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = {
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
-static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
+static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index ab6343d21825..3b8a2d30d14e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void)
goto out;
}
- pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
- pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
+ pr_info("IBS: LVT offset %d assigned\n", offset);
return 0;
out:
@@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h
static __init int amd_ibs_init(void)
{
u32 caps;
- int ret;
+ int ret = -EINVAL;
caps = __get_ibs_caps();
if (!caps)
return -ENODEV; /* ibs not supported by the cpu */
- if (!ibs_eilvt_valid()) {
- ret = force_ibs_eilvt_setup();
- if (ret) {
- pr_err("Failed to setup IBS, %d\n", ret);
- return ret;
- }
- }
+ /*
+ * Force LVT offset assignment for family 10h: The offsets are
+ * not assigned by the BIOS for this family, so the OS is
+ * responsible for doing it. If the OS assignment fails, fall
+ * back to BIOS settings and try to setup this.
+ */
+ if (boot_cpu_data.x86 == 0x10)
+ force_ibs_eilvt_setup();
+
+ if (!ibs_eilvt_valid())
+ goto out;
get_online_cpus();
ibs_caps = caps;
@@ -287,7 +290,11 @@ static __init int amd_ibs_init(void)
smp_call_function(setup_APIC_ibs, NULL, 1);
put_online_cpus();
- return perf_event_ibs_init();
+ ret = perf_event_ibs_init();
+out:
+ if (ret)
+ pr_err("Failed to setup IBS, %d\n", ret);
+ return ret;
}
/* Since we need the pci subsystem to init ibs we can't do this earlier: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2be5ebe99872..3bd37bdf1b8e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
[PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
+ [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
};
static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /*
- * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
- * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
- * ratio between these counters.
- */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
{
FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
- /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
EVENT_CONSTRAINT_END
};
@@ -1169,7 +1165,7 @@ again:
*/
c = &unconstrained;
} else if (intel_try_alt_er(event, orig_idx)) {
- raw_spin_unlock(&era->lock);
+ raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
}
raw_spin_unlock_irqrestore(&era->lock, flags);
@@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.guest_get_msrs = intel_guest_get_msrs,
};
-static void intel_clovertown_quirks(void)
+static __init void intel_clovertown_quirk(void)
{
/*
* PEBS is unreliable due to:
@@ -1545,12 +1541,60 @@ static void intel_clovertown_quirks(void)
x86_pmu.pebs_constraints = NULL;
}
+static __init void intel_sandybridge_quirk(void)
+{
+ printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+ x86_pmu.pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
+static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
+ { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
+ { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
+ { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
+ { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
+ { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
+ { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
+ { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
+};
+
+static __init void intel_arch_events_quirk(void)
+{
+ int bit;
+
+ /* disable event that reported as not presend by cpuid */
+ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
+ intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
+ printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n",
+ intel_arch_events_map[bit].name);
+ }
+}
+
+static __init void intel_nehalem_quirk(void)
+{
+ union cpuid10_ebx ebx;
+
+ ebx.full = x86_pmu.events_maskl;
+ if (ebx.split.no_branch_misses_retired) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ ebx.split.no_branch_misses_retired = 0;
+ x86_pmu.events_maskl = ebx.full;
+ printk(KERN_INFO "CPU erratum AAJ80 worked around\n");
+ }
+}
+
__init int intel_pmu_init(void)
{
union cpuid10_edx edx;
union cpuid10_eax eax;
+ union cpuid10_ebx ebx;
unsigned int unused;
- unsigned int ebx;
int version;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -1567,8 +1611,8 @@ __init int intel_pmu_init(void)
* Check whether the Architectural PerfMon supports
* Branch Misses Retired hw_event or not.
*/
- cpuid(10, &eax.full, &ebx, &unused, &edx.full);
- if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+ cpuid(10, &eax.full, &ebx.full, &unused, &edx.full);
+ if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT)
return -ENODEV;
version = eax.split.version_id;
@@ -1582,6 +1626,9 @@ __init int intel_pmu_init(void)
x86_pmu.cntval_bits = eax.split.bit_width;
x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.events_maskl = ebx.full;
+ x86_pmu.events_mask_len = eax.split.mask_length;
+
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events:
@@ -1601,6 +1648,8 @@ __init int intel_pmu_init(void)
intel_ds_init();
+ x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
+
/*
* Install the hw-cache-events table:
*/
@@ -1610,7 +1659,7 @@ __init int intel_pmu_init(void)
break;
case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
- x86_pmu.quirks = intel_clovertown_quirks;
+ x86_add_quirk(intel_clovertown_quirk);
case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -1644,17 +1693,8 @@ __init int intel_pmu_init(void)
/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
- if (ebx & 0x40) {
- /*
- * Erratum AAJ80 detected, we work it around by using
- * the BR_MISP_EXEC.ANY event. This will over-count
- * branch-misses, but it's still much better than the
- * architectural event which is often completely bogus:
- */
- intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+ x86_add_quirk(intel_nehalem_quirk);
- pr_cont("erratum AAJ80 worked around, ");
- }
pr_cont("Nehalem events, ");
break;
@@ -1694,6 +1734,7 @@ __init int intel_pmu_init(void)
break;
case 42: /* SandyBridge */
+ x86_add_quirk(intel_sandybridge_quirk);
case 45: /* SandyBridge, "Romely-EP" */
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
@@ -1730,5 +1771,6 @@ __init int intel_pmu_init(void)
break;
}
}
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index c0d238f49db8..73da6b64f5b7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -493,6 +493,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
unsigned long from = cpuc->lbr_entries[0].from;
unsigned long old_to, to = cpuc->lbr_entries[0].to;
unsigned long ip = regs->ip;
+ int is_64bit = 0;
/*
* We don't need to fixup if the PEBS assist is fault like
@@ -544,7 +545,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
} else
kaddr = (void *)to;
- kernel_insn_init(&insn, kaddr);
+#ifdef CONFIG_X86_64
+ is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32);
+#endif
+ insn_init(&insn, kaddr, is_64bit);
insn_get_length(&insn);
to += insn.length;
} while (to < ip);
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 492bf1358a7c..ef484d9d0a25 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1268,7 +1268,7 @@ reserve:
}
done:
- return num ? -ENOSPC : 0;
+ return num ? -EINVAL : 0;
}
static __initconst const struct x86_pmu p4_pmu = {
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 5abbea297e0c..7b3fe56b1c21 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = {
"100mhzsteps",
"hwpstate",
"", /* tsc invariant mapped to constant_tsc */
- /* nothing */
+ "cpb", /* core performance boost */
+ "eff_freq_ro", /* Readonly aperf/mperf */
};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 14b23140e81f..8022c6681485 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- unsigned int cpu = 0;
+ unsigned int cpu;
int i;
-#ifdef CONFIG_SMP
cpu = c->cpu_index;
-#endif
seq_printf(m, "processor\t: %u\n"
"vendor_id\t: %s\n"
"cpu family\t: %d\n"
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 212a6a42527c..a524353d93f2 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
.notifier_call = cpuid_class_cpu_callback,
};
-static char *cpuid_devnode(struct device *dev, mode_t *mode)
+static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 3b97a80ce329..c99f9ed013d5 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs)
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(ip, c)) {
- printk(" Bad EIP value.");
+ printk(KERN_CONT " Bad EIP value.");
break;
}
if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
+ printk(KERN_CONT "<%02x> ", c);
else
- printk("%02x ", c);
+ printk(KERN_CONT "%02x ", c);
}
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 19853ad8afc5..6d728d9284bd 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs)
for (i = 0; i < code_len; i++, ip++) {
if (ip < (u8 *)PAGE_OFFSET ||
probe_kernel_address(ip, c)) {
- printk(" Bad RIP value.");
+ printk(KERN_CONT " Bad RIP value.");
break;
}
if (ip == (u8 *)regs->ip)
- printk("<%02x> ", c);
+ printk(KERN_CONT "<%02x> ", c);
else
- printk("%02x ", c);
+ printk(KERN_CONT "%02x ", c);
}
}
- printk("\n");
+ printk(KERN_CONT "\n");
}
int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 303a0e48f076..174d938d576b 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -19,6 +19,7 @@
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
+#include <linux/sort.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -227,22 +228,38 @@ void __init e820_print_map(char *who)
* ____________________33__
* ______________________4_
*/
+struct change_member {
+ struct e820entry *pbios; /* pointer to original bios entry */
+ unsigned long long addr; /* address for this change point */
+};
+
+static int __init cpcompare(const void *a, const void *b)
+{
+ struct change_member * const *app = a, * const *bpp = b;
+ const struct change_member *ap = *app, *bp = *bpp;
+
+ /*
+ * Inputs are pointers to two elements of change_point[]. If their
+ * addresses are unequal, their difference dominates. If the addresses
+ * are equal, then consider one that represents the end of its region
+ * to be greater than one that does not.
+ */
+ if (ap->addr != bp->addr)
+ return ap->addr > bp->addr ? 1 : -1;
+
+ return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
+}
int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
u32 *pnr_map)
{
- struct change_member {
- struct e820entry *pbios; /* pointer to original bios entry */
- unsigned long long addr; /* address for this change point */
- };
static struct change_member change_point_list[2*E820_X_MAX] __initdata;
static struct change_member *change_point[2*E820_X_MAX] __initdata;
static struct e820entry *overlap_list[E820_X_MAX] __initdata;
static struct e820entry new_bios[E820_X_MAX] __initdata;
- struct change_member *change_tmp;
unsigned long current_type, last_type;
unsigned long long last_addr;
- int chgidx, still_changing;
+ int chgidx;
int overlap_entries;
int new_bios_entry;
int old_nr, new_nr, chg_nr;
@@ -279,35 +296,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
chg_nr = chgidx;
/* sort change-point list by memory addresses (low -> high) */
- still_changing = 1;
- while (still_changing) {
- still_changing = 0;
- for (i = 1; i < chg_nr; i++) {
- unsigned long long curaddr, lastaddr;
- unsigned long long curpbaddr, lastpbaddr;
-
- curaddr = change_point[i]->addr;
- lastaddr = change_point[i - 1]->addr;
- curpbaddr = change_point[i]->pbios->addr;
- lastpbaddr = change_point[i - 1]->pbios->addr;
-
- /*
- * swap entries, when:
- *
- * curaddr > lastaddr or
- * curaddr == lastaddr and curaddr == curpbaddr and
- * lastaddr != lastpbaddr
- */
- if (curaddr < lastaddr ||
- (curaddr == lastaddr && curaddr == curpbaddr &&
- lastaddr != lastpbaddr)) {
- change_tmp = change_point[i];
- change_point[i] = change_point[i-1];
- change_point[i-1] = change_tmp;
- still_changing = 1;
- }
- }
- }
+ sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL);
/* create a new bios memory map, removing overlaps */
overlap_entries = 0; /* number of entries in the overlap table */
@@ -738,35 +727,17 @@ core_initcall(e820_mark_nvs_memory);
/*
* pre allocated 4k and reserved it in memblock and e820_saved
*/
-u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
+u64 __init early_reserve_e820(u64 size, u64 align)
{
- u64 size = 0;
u64 addr;
- u64 start;
- for (start = startt; ; start += size) {
- start = memblock_x86_find_in_range_size(start, &size, align);
- if (start == MEMBLOCK_ERROR)
- return 0;
- if (size >= sizet)
- break;
+ addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr) {
+ e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED);
+ printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
+ update_e820_saved();
}
-#ifdef CONFIG_X86_32
- if (start >= MAXMEM)
- return 0;
- if (start + size > MAXMEM)
- size = MAXMEM - start;
-#endif
-
- addr = round_down(start + size - sizet, align);
- if (addr < start)
- return 0;
- memblock_x86_reserve_range(addr, addr + sizet, "new next");
- e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
- printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
- update_e820_saved();
-
return addr;
}
@@ -1090,7 +1061,7 @@ void __init memblock_x86_fill(void)
* We are safe to enable resizing, beause memblock_x86_fill()
* is rather later for x86
*/
- memblock_can_resize = 1;
+ memblock_allow_resize();
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
@@ -1105,22 +1076,36 @@ void __init memblock_x86_fill(void)
memblock_add(ei->addr, ei->size);
}
- memblock_analyze();
memblock_dump_all();
}
void __init memblock_find_dma_reserve(void)
{
#ifdef CONFIG_X86_64
- u64 free_size_pfn;
- u64 mem_size_pfn;
+ u64 nr_pages = 0, nr_free_pages = 0;
+ unsigned long start_pfn, end_pfn;
+ phys_addr_t start, end;
+ int i;
+ u64 u;
+
/*
* need to find out used area below MAX_DMA_PFN
* need to use memblock to get free size in [0, MAX_DMA_PFN]
* at first, and assume boot_mem will not take below MAX_DMA_PFN
*/
- mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
- set_dma_reserve(mem_size_pfn - free_size_pfn);
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+ start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN);
+ nr_pages += end_pfn - start_pfn;
+ }
+
+ for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
+ start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
+ end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
+ if (start_pfn < end_pfn)
+ nr_free_pages += end_pfn - start_pfn;
+ }
+
+ set_dma_reserve(nr_pages - nr_free_pages);
#endif
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cd28a350f7f9..9b9f18b49918 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_EARLY_PRINTK_MRST
+#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
if (!strncmp(buf, "mrst", 4)) {
mrst_early_console_init();
early_console_register(&early_mrst_console, keep);
}
if (!strncmp(buf, "hsu", 3)) {
- hsu_early_console_init();
+ hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
}
#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 1ffcda22c2f6..4af9fd2450a5 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -623,6 +623,8 @@ work_notifysig: # deal with pending signals and
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
@@ -636,6 +638,8 @@ work_notifysig_v86:
#else
movl %esp, %eax
#endif
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %edx, %edx
call do_notify_resume
jmp resume_userspace_sig
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index faf8d5e74b0b..940ba711fc28 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
/*CFI_REL_OFFSET ss,0*/
pushq_cfi %rax /* rsp */
CFI_REL_OFFSET rsp,0
- pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
+ pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
/*CFI_REL_OFFSET rflags,0*/
pushq_cfi $__KERNEL_CS /* cs */
/*CFI_REL_OFFSET cs,0*/
@@ -411,7 +411,7 @@ ENTRY(ret_from_fork)
RESTORE_REST
testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- je int_ret_from_sys_call
+ jz retint_restore_args
testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
jnz int_ret_from_sys_call
@@ -465,7 +465,7 @@ ENTRY(system_call)
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
-ENTRY(system_call_after_swapgs)
+GLOBAL(system_call_after_swapgs)
movq %rsp,PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack),%rsp
@@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs)
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
- GET_THREAD_INFO(%rcx)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
cmpq $__NR_syscall_max,%rax
@@ -496,10 +495,9 @@ ret_from_sys_call:
/* edi: flagmask */
sysret_check:
LOCKDEP_SYS_EXIT
- GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags(%rcx),%edx
+ movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
andl %edi,%edx
jnz sysret_careful
CFI_REMEMBER_STATE
@@ -583,7 +581,7 @@ sysret_audit:
/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jz auditsys
#endif
SAVE_REST
@@ -612,8 +610,6 @@ tracesys:
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $3,CS-ARGOFFSET(%rsp)
- je retint_restore_args
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -953,6 +949,7 @@ END(common_interrupt)
ENTRY(\sym)
INTR_FRAME
pushq_cfi $~(\num)
+.Lcommon_\sym:
interrupt \do_sym
jmp ret_from_intr
CFI_ENDPROC
@@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_SMP
-.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+ ALIGN
+ INTR_FRAME
+.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.if NUM_INVALIDATE_TLB_VECTORS > \idx
-apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
- invalidate_interrupt\idx smp_invalidate_interrupt
+ENTRY(invalidate_interrupt\idx)
+ pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
+ jmp .Lcommon_invalidate_interrupt0
+ CFI_ADJUST_CFA_OFFSET -8
+END(invalidate_interrupt\idx)
.endif
.endr
+ CFI_ENDPROC
+apicinterrupt INVALIDATE_TLB_VECTOR_START, \
+ invalidate_interrupt0, smp_invalidate_interrupt
#endif
apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1475,62 +1480,214 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
+/*
+ * Test if a given stack is an NMI stack or not.
+ */
+ .macro test_in_nmi reg stack nmi_ret normal_ret
+ cmpq %\reg, \stack
+ ja \normal_ret
+ subq $EXCEPTION_STKSZ, %\reg
+ cmpq %\reg, \stack
+ jb \normal_ret
+ jmp \nmi_ret
+ .endm
/* runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
- pushq_cfi $-1
+ /*
+ * We allow breakpoints in NMIs. If a breakpoint occurs, then
+ * the iretq it performs will take us out of NMI context.
+ * This means that we can have nested NMIs where the next
+ * NMI is using the top of the stack of the previous NMI. We
+ * can't let it execute because the nested NMI will corrupt the
+ * stack of the previous NMI. NMI handlers are not re-entrant
+ * anyway.
+ *
+ * To handle this case we do the following:
+ * Check the a special location on the stack that contains
+ * a variable that is set when NMIs are executing.
+ * The interrupted task's stack is also checked to see if it
+ * is an NMI stack.
+ * If the variable is not set and the stack is not the NMI
+ * stack then:
+ * o Set the special variable on the stack
+ * o Copy the interrupt frame into a "saved" location on the stack
+ * o Copy the interrupt frame into a "copy" location on the stack
+ * o Continue processing the NMI
+ * If the variable is set or the previous stack is the NMI stack:
+ * o Modify the "copy" location to jump to the repeate_nmi
+ * o return back to the first NMI
+ *
+ * Now on exit of the first NMI, we first clear the stack variable
+ * The NMI stack will tell any nested NMIs at that point that it is
+ * nested. Then we pop the stack normally with iret, and if there was
+ * a nested NMI that updated the copy interrupt stack frame, a
+ * jump will be made to the repeat_nmi code that will handle the second
+ * NMI.
+ */
+
+ /* Use %rdx as out temp variable throughout */
+ pushq_cfi %rdx
+
+ /*
+ * Check the special variable on the stack to see if NMIs are
+ * executing.
+ */
+ cmp $1, -8(%rsp)
+ je nested_nmi
+
+ /*
+ * Now test if the previous stack was an NMI stack.
+ * We need the double check. We check the NMI stack to satisfy the
+ * race when the first NMI clears the variable before returning.
+ * We check the variable because the first NMI could be in a
+ * breakpoint routine using a breakpoint stack.
+ */
+ lea 6*8(%rsp), %rdx
+ test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+
+nested_nmi:
+ /*
+ * Do nothing if we interrupted the fixup in repeat_nmi.
+ * It's about to repeat the NMI handler, so we are fine
+ * with ignoring this one.
+ */
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
+
+1:
+ /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ leaq -6*8(%rsp), %rdx
+ movq %rdx, %rsp
+ CFI_ADJUST_CFA_OFFSET 6*8
+ pushq_cfi $__KERNEL_DS
+ pushq_cfi %rdx
+ pushfq_cfi
+ pushq_cfi $__KERNEL_CS
+ pushq_cfi $repeat_nmi
+
+ /* Put stack back */
+ addq $(11*8), %rsp
+ CFI_ADJUST_CFA_OFFSET -11*8
+
+nested_nmi_out:
+ popq_cfi %rdx
+
+ /* No need to check faults here */
+ INTERRUPT_RETURN
+
+first_nmi:
+ /*
+ * Because nested NMIs will use the pushed location that we
+ * stored in rdx, we must keep that space available.
+ * Here's what our stack frame will look like:
+ * +-------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +-------------------------+
+ * | temp storage for rdx |
+ * +-------------------------+
+ * | NMI executing variable |
+ * +-------------------------+
+ * | Saved SS |
+ * | Saved Return RSP |
+ * | Saved RFLAGS |
+ * | Saved CS |
+ * | Saved RIP |
+ * +-------------------------+
+ * | copied SS |
+ * | copied Return RSP |
+ * | copied RFLAGS |
+ * | copied CS |
+ * | copied RIP |
+ * +-------------------------+
+ * | pt_regs |
+ * +-------------------------+
+ *
+ * The saved RIP is used to fix up the copied RIP that a nested
+ * NMI may zero out. The original stack frame and the temp storage
+ * is also used by nested NMIs and can not be trusted on exit.
+ */
+ /* Set the NMI executing variable on the stack. */
+ pushq_cfi $1
+
+ /* Copy the stack frame to the Saved frame */
+ .rept 5
+ pushq_cfi 6*8(%rsp)
+ .endr
+
+ /* Make another copy, this one may be modified by nested NMIs */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ /* Do not pop rdx, nested NMIs will corrupt it */
+ movq 11*8(%rsp), %rdx
+
+ /*
+ * Everything below this point can be preempted by a nested
+ * NMI if the first NMI took an exception. Repeated NMIs
+ * caused by an exception and nested NMI will start here, and
+ * can still be preempted by another NMI.
+ */
+restart_nmi:
+ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
subq $ORIG_RAX-R15, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ /*
+ * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * as we should not be calling schedule in NMI context.
+ * Even with normal interrupts enabled. An NMI should not be
+ * setting NEED_RESCHED or anything that normal interrupts and
+ * exceptions might do.
+ */
call save_paranoid
DEFAULT_FRAME 0
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp,%rdi
movq $-1,%rsi
call do_nmi
-#ifdef CONFIG_TRACE_IRQFLAGS
- /* paranoidexit; without TRACE_IRQS_OFF */
- /* ebx: no swapgs flag */
- DISABLE_INTERRUPTS(CLBR_NONE)
testl %ebx,%ebx /* swapgs needed? */
jnz nmi_restore
- testl $3,CS(%rsp)
- jnz nmi_userspace
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_ALL 8
+ /* Clear the NMI executing stack variable */
+ movq $0, 10*8(%rsp)
jmp irq_return
-nmi_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz nmi_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz nmi_schedule
- movl %ebx,%edx /* arg3: thread flags */
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- jmp nmi_userspace
-nmi_schedule:
- ENABLE_INTERRUPTS(CLBR_ANY)
- call schedule
- DISABLE_INTERRUPTS(CLBR_ANY)
- jmp nmi_userspace
CFI_ENDPROC
-#else
- jmp paranoid_exit
- CFI_ENDPROC
-#endif
END(nmi)
+ /*
+ * If an NMI hit an iret because of an exception or breakpoint,
+ * it can lose its NMI context, and a nested NMI may come in.
+ * In that case, the nested NMI will change the preempted NMI's
+ * stack to jump to here when it does the final iret.
+ */
+repeat_nmi:
+ INTR_FRAME
+ /* Update the stack variable to say we are still in NMI */
+ movq $1, 5*8(%rsp)
+
+ /* copy the saved stack back to copy stack */
+ .rept 5
+ pushq_cfi 4*8(%rsp)
+ .endr
+
+ jmp restart_nmi
+ CFI_ENDPROC
+end_repeat_nmi:
+
ENTRY(ignore_sysret)
CFI_STARTPROC
mov $-ENOSYS,%eax
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index af0699ba48cf..48d9d4ea1020 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
lowmem = 0x9f000;
/* reserve all memory between lowmem and the 1MB mark */
- memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
+ memblock_reserve(lowmem, 0x100000 - lowmem);
}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3bb08509a7a1..51ff18616d50 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void)
void __init i386_start_kernel(void)
{
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -42,7 +41,7 @@ void __init i386_start_kernel(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size = boot_params.hdr.ramdisk_size;
u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
#endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5655c2272adb..3a3b779f41d3 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data)
{
copy_bootdata(__va(real_mode_data));
- memblock_init();
-
- memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
+ memblock_reserve(__pa_symbol(&_text),
+ __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
@@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
- memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
+ memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
}
#endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11e39478a49..40f4eb3766d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -417,6 +417,10 @@ ENTRY(phys_base)
ENTRY(idt_table)
.skip IDT_ENTRIES * 16
+ .align L1_CACHE_BYTES
+ENTRY(nmi_idt_table)
+ .skip IDT_ENTRIES * 16
+
__PAGE_ALIGNED_BSS
.align PAGE_SIZE
ENTRY(empty_zero_page)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b946a9eac7d9..ad0de0c2714e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -2,7 +2,6 @@
#include <linux/clockchips.h>
#include <linux/interrupt.h>
#include <linux/export.h>
-#include <linux/sysdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/i8253.h>
@@ -32,8 +31,6 @@
#define HPET_MIN_CYCLES 128
#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
-#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
-
/*
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
@@ -55,6 +52,11 @@ struct hpet_dev {
char name[10];
};
+inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
+{
+ return container_of(evtdev, struct hpet_dev, evt);
+}
+
inline unsigned int hpet_readl(unsigned int a)
{
return readl(hpet_virt_address + a);
@@ -1049,6 +1051,14 @@ int hpet_rtc_timer_init(void)
}
EXPORT_SYMBOL_GPL(hpet_rtc_timer_init);
+static void hpet_disable_rtc_channel(void)
+{
+ unsigned long cfg;
+ cfg = hpet_readl(HPET_T1_CFG);
+ cfg &= ~HPET_TN_ENABLE;
+ hpet_writel(cfg, HPET_T1_CFG);
+}
+
/*
* The functions below are called from rtc driver.
* Return 0 if HPET is not being used.
@@ -1060,6 +1070,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask)
return 0;
hpet_rtc_flags &= ~bit_mask;
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
+
return 1;
}
EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit);
@@ -1125,15 +1138,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
static void hpet_rtc_timer_reinit(void)
{
- unsigned int cfg, delta;
+ unsigned int delta;
int lost_ints = -1;
- if (unlikely(!hpet_rtc_flags)) {
- cfg = hpet_readl(HPET_T1_CFG);
- cfg &= ~HPET_TN_ENABLE;
- hpet_writel(cfg, HPET_T1_CFG);
- return;
- }
+ if (unlikely(!hpet_rtc_flags))
+ hpet_disable_rtc_channel();
if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
delta = hpet_default_delta;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..7943e0c21bde 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec)
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
seq_printf(p, " IRQ work interrupts\n");
+ seq_printf(p, "%*s: ", prec, "RTR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
+ seq_printf(p, " APIC ICR read retries\n");
#endif
if (x86_platform_ipi_callback) {
seq_printf(p, "%*s: ", prec, "PLT");
@@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += irq_stats(cpu)->irq_spurious_count;
sum += irq_stats(cpu)->apic_perf_irqs;
sum += irq_stats(cpu)->apic_irq_work_irqs;
+ sum += irq_stats(cpu)->icr_read_retry_count;
#endif
if (x86_platform_ipi_callback)
sum += irq_stats(cpu)->x86_platform_ipis;
@@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
unsigned vector = ~regs->orig_ax;
unsigned irq;
- exit_idle();
irq_enter();
+ exit_idle();
irq = __this_cpu_read(vector_irq[vector]);
@@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
ack_APIC_irq();
- exit_idle();
-
irq_enter();
+ exit_idle();
+
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 72090705a656..40fc86161d92 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,6 +28,9 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+int sysctl_panic_on_stackoverflow __read_mostly;
+
/* Debugging check for stack overflow: is there less than 1KB free? */
static int check_stack_overflow(void)
{
@@ -43,6 +46,8 @@ static void print_stack_overflow(void)
{
printk(KERN_WARNING "low stack detected by irq handler\n");
dump_stack();
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
}
#else
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index acf8fbf8fbda..d04d3ecded62 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,6 +26,8 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
+int sysctl_panic_on_stackoverflow;
+
/*
* Probabilistic stack overflow check:
*
@@ -36,15 +38,39 @@ EXPORT_PER_CPU_SYMBOL(irq_regs);
static inline void stack_overflow_check(struct pt_regs *regs)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
+#define STACK_TOP_MARGIN 128
+ struct orig_ist *oist;
+ u64 irq_stack_top, irq_stack_bottom;
+ u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
- WARN_ONCE(regs->sp >= curbase &&
- regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + 128,
+ if (user_mode_vm(regs))
+ return;
+
+ if (regs->sp >= curbase + sizeof(struct thread_info) +
+ sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ regs->sp <= curbase + THREAD_SIZE)
+ return;
+
+ irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
+ STACK_TOP_MARGIN;
+ irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
+ if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
+ return;
+
+ oist = &__get_cpu_var(orig_ist);
+ estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
+ estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
+ if (regs->sp >= estack_top && regs->sp <= estack_bottom)
+ return;
+
+ WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+ current->comm, curbase, regs->sp,
+ irq_stack_top, irq_stack_bottom,
+ estack_top, estack_bottom);
- "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
+ if (sysctl_panic_on_stackoverflow)
+ panic("low stack detected by irq handler - check messages\n");
#endif
}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b3300e6bacef..313fb5cddbce 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,7 +9,7 @@
#include <linux/kprobes.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index ea9d5f2f13ef..2889b3d43882 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
put_online_cpus();
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
+__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
__jump_label_transform(entry, type, text_poke_early);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a9c2116001d6..f0c6fd6f176b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -39,8 +39,6 @@
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#define MMU_QUEUE_SIZE 1024
-
static int kvmapf = 1;
static int parse_no_kvmapf(char *arg)
@@ -60,21 +58,10 @@ static int parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
-struct kvm_para_state {
- u8 mmu_queue[MMU_QUEUE_SIZE];
- int mmu_queue_len;
-};
-
-static DEFINE_PER_CPU(struct kvm_para_state, para_state);
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
-static struct kvm_para_state *kvm_para_state(void)
-{
- return &per_cpu(para_state, raw_smp_processor_id());
-}
-
/*
* No need for any "IO delay" on KVM
*/
@@ -271,151 +258,6 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
}
}
-static void kvm_mmu_op(void *buffer, unsigned len)
-{
- int r;
- unsigned long a1, a2;
-
- do {
- a1 = __pa(buffer);
- a2 = 0; /* on i386 __pa() always returns <4G */
- r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
- buffer += r;
- len -= r;
- } while (len);
-}
-
-static void mmu_queue_flush(struct kvm_para_state *state)
-{
- if (state->mmu_queue_len) {
- kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
- state->mmu_queue_len = 0;
- }
-}
-
-static void kvm_deferred_mmu_op(void *buffer, int len)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
- kvm_mmu_op(buffer, len);
- return;
- }
- if (state->mmu_queue_len + len > sizeof state->mmu_queue)
- mmu_queue_flush(state);
- memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
- state->mmu_queue_len += len;
-}
-
-static void kvm_mmu_write(void *dest, u64 val)
-{
- __u64 pte_phys;
- struct kvm_mmu_op_write_pte wpte;
-
-#ifdef CONFIG_HIGHPTE
- struct page *page;
- unsigned long dst = (unsigned long) dest;
-
- page = kmap_atomic_to_page(dest);
- pte_phys = page_to_pfn(page);
- pte_phys <<= PAGE_SHIFT;
- pte_phys += (dst & ~(PAGE_MASK));
-#else
- pte_phys = (unsigned long)__pa(dest);
-#endif
- wpte.header.op = KVM_MMU_OP_WRITE_PTE;
- wpte.pte_val = val;
- wpte.pte_phys = pte_phys;
-
- kvm_deferred_mmu_op(&wpte, sizeof wpte);
-}
-
-/*
- * We only need to hook operations that are MMU writes. We hook these so that
- * we can use lazy MMU mode to batch these operations. We could probably
- * improve the performance of the host code if we used some of the information
- * here to simplify processing of batched writes.
- */
-static void kvm_set_pte(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
-{
- kvm_mmu_write(pmdp, pmd_val(pmd));
-}
-
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
-static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
-{
- kvm_mmu_write(ptep, pte_val(pte));
-}
-
-static void kvm_pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- kvm_mmu_write(ptep, 0);
-}
-
-static void kvm_pmd_clear(pmd_t *pmdp)
-{
- kvm_mmu_write(pmdp, 0);
-}
-#endif
-
-static void kvm_set_pud(pud_t *pudp, pud_t pud)
-{
- kvm_mmu_write(pudp, pud_val(pud));
-}
-
-#if PAGETABLE_LEVELS == 4
-static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
-{
- kvm_mmu_write(pgdp, pgd_val(pgd));
-}
-#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
-
-static void kvm_flush_tlb(void)
-{
- struct kvm_mmu_op_flush_tlb ftlb = {
- .header.op = KVM_MMU_OP_FLUSH_TLB,
- };
-
- kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
-}
-
-static void kvm_release_pt(unsigned long pfn)
-{
- struct kvm_mmu_op_release_pt rpt = {
- .header.op = KVM_MMU_OP_RELEASE_PT,
- .pt_phys = (u64)pfn << PAGE_SHIFT,
- };
-
- kvm_mmu_op(&rpt, sizeof rpt);
-}
-
-static void kvm_enter_lazy_mmu(void)
-{
- paravirt_enter_lazy_mmu();
-}
-
-static void kvm_leave_lazy_mmu(void)
-{
- struct kvm_para_state *state = kvm_para_state();
-
- mmu_queue_flush(state);
- paravirt_leave_lazy_mmu();
-}
-
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
@@ -424,29 +266,6 @@ static void __init paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
- if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
- pv_mmu_ops.set_pte = kvm_set_pte;
- pv_mmu_ops.set_pte_at = kvm_set_pte_at;
- pv_mmu_ops.set_pmd = kvm_set_pmd;
-#if PAGETABLE_LEVELS >= 3
-#ifdef CONFIG_X86_PAE
- pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
- pv_mmu_ops.pte_clear = kvm_pte_clear;
- pv_mmu_ops.pmd_clear = kvm_pmd_clear;
-#endif
- pv_mmu_ops.set_pud = kvm_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = kvm_set_pgd;
-#endif
-#endif
- pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
- pv_mmu_ops.release_pte = kvm_release_pt;
- pv_mmu_ops.release_pmd = kvm_release_pt;
- pv_mmu_ops.release_pud = kvm_release_pt;
-
- pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
- }
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index c1a0188e29ae..44842d756b29 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -74,9 +74,10 @@ static cycle_t kvm_clock_read(void)
struct pvclock_vcpu_time_info *src;
cycle_t ret;
- src = &get_cpu_var(hv_clock);
+ preempt_disable_notrace();
+ src = &__get_cpu_var(hv_clock);
ret = pvclock_clocksource_read(src);
- put_cpu_var(hv_clock);
+ preempt_enable_notrace();
return ret;
}
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index d494799aafcd..fe86493f3ed1 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -1,14 +1,18 @@
/*
* AMD CPU Microcode Update Driver for Linux
- * Copyright (C) 2008 Advanced Micro Devices Inc.
+ * Copyright (C) 2008-2011 Advanced Micro Devices Inc.
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
*
- * This driver allows to upgrade microcode on AMD
- * family 0x10 and 0x11 processors.
+ * Maintainers:
+ * Andreas Herrmann <andreas.herrmann3@amd.com>
+ * Borislav Petkov <borislav.petkov@amd.com>
+ *
+ * This driver allows to upgrade microcode on F10h AMD
+ * CPUs and later.
*
* Licensed under the terms of the GNU General Public
* License version 2. See file COPYING for details.
@@ -71,6 +75,9 @@ struct microcode_amd {
static struct equiv_cpu_entry *equiv_cpu_table;
+/* page-sized ucode patch buffer */
+void *patch;
+
static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
return 0;
}
-static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
- int rev)
+static unsigned int verify_ucode_size(int cpu, u32 patch_size,
+ unsigned int size)
{
- unsigned int current_cpu_id;
- u16 equiv_cpu_id = 0;
- unsigned int i = 0;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ u32 max_size;
+
+#define F1XH_MPB_MAX_SIZE 2048
+#define F14H_MPB_MAX_SIZE 1824
+#define F15H_MPB_MAX_SIZE 4096
+
+ switch (c->x86) {
+ case 0x14:
+ max_size = F14H_MPB_MAX_SIZE;
+ break;
+ case 0x15:
+ max_size = F15H_MPB_MAX_SIZE;
+ break;
+ default:
+ max_size = F1XH_MPB_MAX_SIZE;
+ break;
+ }
+
+ if (patch_size > min_t(u32, size, max_size)) {
+ pr_err("patch size mismatch\n");
+ return 0;
+ }
+
+ return patch_size;
+}
+
+static u16 find_equiv_id(void)
+{
+ unsigned int current_cpu_id, i = 0;
BUG_ON(equiv_cpu_table == NULL);
+
current_cpu_id = cpuid_eax(0x00000001);
while (equiv_cpu_table[i].installed_cpu != 0) {
- if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
- equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
- break;
- }
+ if (current_cpu_id == equiv_cpu_table[i].installed_cpu)
+ return equiv_cpu_table[i].equiv_cpu;
+
i++;
}
+ return 0;
+}
+/*
+ * we signal a good patch is found by returning its size > 0
+ */
+static int get_matching_microcode(int cpu, const u8 *ucode_ptr,
+ unsigned int leftover_size, int rev,
+ unsigned int *current_size)
+{
+ struct microcode_header_amd *mc_hdr;
+ unsigned int actual_size;
+ u16 equiv_cpu_id;
+
+ /* size of the current patch we're staring at */
+ *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE;
+
+ equiv_cpu_id = find_equiv_id();
if (!equiv_cpu_id)
return 0;
+ /*
+ * let's look at the patch header itself now
+ */
+ mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE);
+
if (mc_hdr->processor_rev_id != equiv_cpu_id)
return 0;
@@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
if (mc_hdr->patch_id <= rev)
return 0;
- return 1;
+ /*
+ * now that the header looks sane, verify its size
+ */
+ actual_size = verify_ucode_size(cpu, *current_size, leftover_size);
+ if (!actual_size)
+ return 0;
+
+ /* clear the patch buffer */
+ memset(patch, 0, PAGE_SIZE);
+
+ /* all looks ok, get the binary patch */
+ get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size);
+
+ return actual_size;
}
static int apply_microcode_amd(int cpu)
@@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu)
return 0;
}
-static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- u32 max_size, actual_size;
-
-#define F1XH_MPB_MAX_SIZE 2048
-#define F14H_MPB_MAX_SIZE 1824
-#define F15H_MPB_MAX_SIZE 4096
-
- switch (c->x86) {
- case 0x14:
- max_size = F14H_MPB_MAX_SIZE;
- break;
- case 0x15:
- max_size = F15H_MPB_MAX_SIZE;
- break;
- default:
- max_size = F1XH_MPB_MAX_SIZE;
- break;
- }
-
- actual_size = *(u32 *)(buf + 4);
-
- if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
- pr_err("section size mismatch\n");
- return 0;
- }
-
- return actual_size;
-}
-
-static struct microcode_header_amd *
-get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
-{
- struct microcode_header_amd *mc = NULL;
- unsigned int actual_size = 0;
-
- if (*(u32 *)buf != UCODE_UCODE_TYPE) {
- pr_err("invalid type field in container file section header\n");
- goto out;
- }
-
- actual_size = verify_ucode_size(cpu, buf, size);
- if (!actual_size)
- goto out;
-
- mc = vzalloc(actual_size);
- if (!mc)
- goto out;
-
- get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
- *mc_size = actual_size + SECTION_HDR_SIZE;
-
-out:
- return mc;
-}
-
static int install_equiv_cpu_table(const u8 *buf)
{
unsigned int *ibuf = (unsigned int *)buf;
@@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
struct microcode_header_amd *mc_hdr = NULL;
- unsigned int mc_size, leftover;
+ unsigned int mc_size, leftover, current_size = 0;
int offset;
const u8 *ucode_ptr = data;
void *new_mc = NULL;
unsigned int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
+ enum ucode_state state = UCODE_ERROR;
offset = install_equiv_cpu_table(ucode_ptr);
if (offset < 0) {
pr_err("failed to create equivalent cpu table\n");
- return UCODE_ERROR;
+ goto out;
}
-
ucode_ptr += offset;
leftover = size - offset;
- while (leftover) {
- mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
- if (!mc_hdr)
- break;
+ if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) {
+ pr_err("invalid type field in container file section header\n");
+ goto free_table;
+ }
- if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
- vfree(new_mc);
+ while (leftover) {
+ mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
+ new_rev, &current_size);
+ if (mc_size) {
+ mc_hdr = patch;
+ new_mc = patch;
new_rev = mc_hdr->patch_id;
- new_mc = mc_hdr;
- } else
- vfree(mc_hdr);
+ goto out_ok;
+ }
- ucode_ptr += mc_size;
- leftover -= mc_size;
+ ucode_ptr += current_size;
+ leftover -= current_size;
}
if (!new_mc) {
@@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
goto free_table;
}
- if (!leftover) {
- vfree(uci->mc);
- uci->mc = new_mc;
- pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
- cpu, uci->cpu_sig.rev, new_rev);
- } else {
- vfree(new_mc);
- state = UCODE_ERROR;
- }
+out_ok:
+ uci->mc = new_mc;
+ state = UCODE_OK;
+ pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
+ cpu, uci->cpu_sig.rev, new_rev);
free_table:
free_equiv_cpu_table();
+out:
return state;
}
@@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- vfree(uci->mc);
uci->mc = NULL;
}
@@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = {
struct microcode_ops * __init init_amd_microcode(void)
{
+ patch = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!patch)
+ return NULL;
+
return &microcode_amd_ops;
}
+
+void __exit exit_amd_microcode(void)
+{
+ free_page((unsigned long)patch);
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index f2d2a664e797..fda91c307104 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -256,7 +256,7 @@ static int __init microcode_dev_init(void)
return 0;
}
-static void microcode_dev_exit(void)
+static void __exit microcode_dev_exit(void)
{
misc_deregister(&microcode_dev);
}
@@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu)
return err;
}
-static ssize_t reload_store(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t reload_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t size)
{
unsigned long val;
@@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev,
return ret;
}
-static ssize_t version_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t version_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
}
-static ssize_t pf_show(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+static ssize_t pf_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
}
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+static DEVICE_ATTR(reload, 0200, NULL, reload_store);
+static DEVICE_ATTR(version, 0400, version_show, NULL);
+static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL);
static struct attribute *mc_default_attrs[] = {
- &attr_reload.attr,
- &attr_version.attr,
- &attr_processor_flags.attr,
+ &dev_attr_reload.attr,
+ &dev_attr_version.attr,
+ &dev_attr_processor_flags.attr,
NULL
};
@@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu)
return ustate;
}
-static int mc_sysdev_add(struct sys_device *sys_dev)
+static int mc_device_add(struct device *dev, struct subsys_interface *sif)
{
- int err, cpu = sys_dev->id;
+ int err, cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d added\n", cpu);
- err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+ err = sysfs_create_group(&dev->kobj, &mc_attr_group);
if (err)
return err;
if (microcode_init_cpu(cpu) == UCODE_ERROR) {
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return -EINVAL;
}
return err;
}
-static int mc_sysdev_remove(struct sys_device *sys_dev)
+static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
{
- int cpu = sys_dev->id;
+ int cpu = dev->id;
if (!cpu_online(cpu))
return 0;
pr_debug("CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
return 0;
}
-static struct sysdev_driver mc_sysdev_driver = {
- .add = mc_sysdev_add,
- .remove = mc_sysdev_remove,
+static struct subsys_interface mc_cpu_interface = {
+ .name = "microcode",
+ .subsys = &cpu_subsys,
+ .add_dev = mc_device_add,
+ .remove_dev = mc_device_remove,
};
/**
@@ -464,9 +466,9 @@ static __cpuinit int
mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct sys_device *sys_dev;
+ struct device *dev;
- sys_dev = get_cpu_sysdev(cpu);
+ dev = get_cpu_device(cpu);
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
pr_debug("CPU%d added\n", cpu);
- if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+ if (sysfs_create_group(&dev->kobj, &mc_attr_group))
pr_err("Failed to create group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
/* Suspend is in progress, only remove the interface */
- sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+ sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
break;
@@ -519,27 +521,23 @@ static int __init microcode_init(void)
microcode_pdev = platform_device_register_simple("microcode", -1,
NULL, 0);
- if (IS_ERR(microcode_pdev)) {
- microcode_dev_exit();
+ if (IS_ERR(microcode_pdev))
return PTR_ERR(microcode_pdev);
- }
get_online_cpus();
mutex_lock(&microcode_mutex);
- error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+ error = subsys_interface_register(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
- if (error) {
- platform_device_unregister(microcode_pdev);
- return error;
- }
+ if (error)
+ goto out_pdev;
error = microcode_dev_init();
if (error)
- return error;
+ goto out_driver;
register_syscore_ops(&mc_syscore_ops);
register_hotcpu_notifier(&mc_cpu_notifier);
@@ -548,11 +546,27 @@ static int __init microcode_init(void)
" <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
return 0;
+
+out_driver:
+ get_online_cpus();
+ mutex_lock(&microcode_mutex);
+
+ subsys_interface_unregister(&mc_cpu_interface);
+
+ mutex_unlock(&microcode_mutex);
+ put_online_cpus();
+
+out_pdev:
+ platform_device_unregister(microcode_pdev);
+ return error;
+
}
module_init(microcode_init);
static void __exit microcode_exit(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(0);
+
microcode_dev_exit();
unregister_hotcpu_notifier(&mc_cpu_notifier);
@@ -561,7 +575,7 @@ static void __exit microcode_exit(void)
get_online_cpus();
mutex_lock(&microcode_mutex);
- sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+ subsys_interface_unregister(&mc_cpu_interface);
mutex_unlock(&microcode_mutex);
put_online_cpus();
@@ -570,6 +584,9 @@ static void __exit microcode_exit(void)
microcode_ops = NULL;
+ if (c->x86_vendor == X86_VENDOR_AMD)
+ exit_amd_microcode();
+
pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
}
module_exit(microcode_exit);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9103b89c145a..ca470e4c92dc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
}
#endif
+ set_bit(m->busid, mp_bus_not_pci);
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->busid, mp_bus_not_pci);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
#endif
@@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early)
static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
- unsigned long size = get_mpc_size(mpf->physptr);
-
- memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
+ memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr));
}
static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
mpf, (u64)virt_to_phys(mpf));
mem = virt_to_phys(mpf);
- memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
+ memblock_reserve(mem, sizeof(*mpf));
if (mpf->physptr)
smp_reserve_memory(mpf);
@@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
void __init early_reserve_e820_mpc_new(void)
{
- if (enable_update_mptable && alloc_mptable) {
- u64 startt = 0;
- mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
- }
+ if (enable_update_mptable && alloc_mptable)
+ mpc_new_phys = early_reserve_e820(mpc_new_length, 4);
}
static int __init update_mp_table(void)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 12fcbe2c143e..96356762a51d 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
.notifier_call = msr_class_cpu_callback,
};
-static char *msr_devnode(struct device *dev, mode_t *mode)
+static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index b9c8628974af..47acaf319165 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -29,6 +29,7 @@
#include <asm/traps.h>
#include <asm/mach_traps.h>
#include <asm/nmi.h>
+#include <asm/x86_init.h>
#define NMI_MAX_NAMELEN 16
struct nmiaction {
@@ -348,7 +349,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
raw_spin_lock(&nmi_reason_lock);
- reason = get_nmi_reason();
+ reason = x86_platform.get_nmi_reason();
if (reason & NMI_REASON_MASK) {
if (reason & NMI_REASON_SERR)
@@ -404,9 +405,108 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
unknown_nmi_error(reason, regs);
}
+/*
+ * NMIs can hit breakpoints which will cause it to lose its
+ * NMI context with the CPU when the breakpoint does an iret.
+ */
+#ifdef CONFIG_X86_32
+/*
+ * For i386, NMIs use the same stack as the kernel, and we can
+ * add a workaround to the iret problem in C. Simply have 3 states
+ * the NMI can be in.
+ *
+ * 1) not running
+ * 2) executing
+ * 3) latched
+ *
+ * When no NMI is in progress, it is in the "not running" state.
+ * When an NMI comes in, it goes into the "executing" state.
+ * Normally, if another NMI is triggered, it does not interrupt
+ * the running NMI and the HW will simply latch it so that when
+ * the first NMI finishes, it will restart the second NMI.
+ * (Note, the latch is binary, thus multiple NMIs triggering,
+ * when one is running, are ignored. Only one NMI is restarted.)
+ *
+ * If an NMI hits a breakpoint that executes an iret, another
+ * NMI can preempt it. We do not want to allow this new NMI
+ * to run, but we want to execute it when the first one finishes.
+ * We set the state to "latched", and the first NMI will perform
+ * an cmpxchg on the state, and if it doesn't successfully
+ * reset the state to "not running" it will restart the next
+ * NMI.
+ */
+enum nmi_states {
+ NMI_NOT_RUNNING,
+ NMI_EXECUTING,
+ NMI_LATCHED,
+};
+static DEFINE_PER_CPU(enum nmi_states, nmi_state);
+
+#define nmi_nesting_preprocess(regs) \
+ do { \
+ if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \
+ __get_cpu_var(nmi_state) = NMI_LATCHED; \
+ return; \
+ } \
+ nmi_restart: \
+ __get_cpu_var(nmi_state) = NMI_EXECUTING; \
+ } while (0)
+
+#define nmi_nesting_postprocess() \
+ do { \
+ if (cmpxchg(&__get_cpu_var(nmi_state), \
+ NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \
+ goto nmi_restart; \
+ } while (0)
+#else /* x86_64 */
+/*
+ * In x86_64 things are a bit more difficult. This has the same problem
+ * where an NMI hitting a breakpoint that calls iret will remove the
+ * NMI context, allowing a nested NMI to enter. What makes this more
+ * difficult is that both NMIs and breakpoints have their own stack.
+ * When a new NMI or breakpoint is executed, the stack is set to a fixed
+ * point. If an NMI is nested, it will have its stack set at that same
+ * fixed address that the first NMI had, and will start corrupting the
+ * stack. This is handled in entry_64.S, but the same problem exists with
+ * the breakpoint stack.
+ *
+ * If a breakpoint is being processed, and the debug stack is being used,
+ * if an NMI comes in and also hits a breakpoint, the stack pointer
+ * will be set to the same fixed address as the breakpoint that was
+ * interrupted, causing that stack to be corrupted. To handle this case,
+ * check if the stack that was interrupted is the debug stack, and if
+ * so, change the IDT so that new breakpoints will use the current stack
+ * and not switch to the fixed address. On return of the NMI, switch back
+ * to the original IDT.
+ */
+static DEFINE_PER_CPU(int, update_debug_stack);
+
+static inline void nmi_nesting_preprocess(struct pt_regs *regs)
+{
+ /*
+ * If we interrupted a breakpoint, it is possible that
+ * the nmi handler will have breakpoints too. We need to
+ * change the IDT such that breakpoints that happen here
+ * continue to use the NMI stack.
+ */
+ if (unlikely(is_debug_stack(regs->sp))) {
+ debug_stack_set_zero();
+ __get_cpu_var(update_debug_stack) = 1;
+ }
+}
+
+static inline void nmi_nesting_postprocess(void)
+{
+ if (unlikely(__get_cpu_var(update_debug_stack)))
+ debug_stack_reset();
+}
+#endif
+
dotraplinkage notrace __kprobes void
do_nmi(struct pt_regs *regs, long error_code)
{
+ nmi_nesting_preprocess(regs);
+
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -415,6 +515,9 @@ do_nmi(struct pt_regs *regs, long error_code)
default_do_nmi(regs);
nmi_exit();
+
+ /* On i386, may loop back to preprocess */
+ nmi_nesting_postprocess();
}
void stop_nmi(void)
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
new file mode 100644
index 000000000000..0d01a8ea4e11
--- /dev/null
+++ b/arch/x86/kernel/nmi_selftest.c
@@ -0,0 +1,180 @@
+/*
+ * arch/x86/kernel/nmi-selftest.c
+ *
+ * Testsuite for NMI: IPIs
+ *
+ * Started by Don Zickus:
+ * (using lib/locking-selftest.c as a guide)
+ *
+ * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
+ */
+
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+
+#include <asm/apic.h>
+#include <asm/nmi.h>
+
+#define SUCCESS 0
+#define FAILURE 1
+#define TIMEOUT 2
+
+static int nmi_fail;
+
+/* check to see if NMI IPIs work on this machine */
+static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __read_mostly;
+
+static int testcase_total;
+static int testcase_successes;
+static int expected_testcase_failures;
+static int unexpected_testcase_failures;
+static int unexpected_testcase_unknowns;
+
+static int nmi_unk_cb(unsigned int val, struct pt_regs *regs)
+{
+ unexpected_testcase_unknowns++;
+ return NMI_HANDLED;
+}
+
+static void init_nmi_testsuite(void)
+{
+ /* trap all the unknown NMIs we may generate */
+ register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk");
+}
+
+static void cleanup_nmi_testsuite(void)
+{
+ unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
+}
+
+static int test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+
+static void test_nmi_ipi(struct cpumask *mask)
+{
+ unsigned long timeout;
+
+ if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
+ NMI_FLAG_FIRST, "nmi_selftest")) {
+ nmi_fail = FAILURE;
+ return;
+ }
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ apic->send_IPI_mask(mask, NMI_VECTOR);
+
+ /* Don't wait longer than a second */
+ timeout = USEC_PER_SEC;
+ while (!cpumask_empty(mask) && timeout--)
+ udelay(1);
+
+ /* What happens if we timeout, do we still unregister?? */
+ unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
+
+ if (!timeout)
+ nmi_fail = TIMEOUT;
+ return;
+}
+
+static void remote_ipi(void)
+{
+ cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
+ cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void local_ipi(void)
+{
+ cpumask_clear(to_cpumask(nmi_ipi_mask));
+ cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
+ test_nmi_ipi(to_cpumask(nmi_ipi_mask));
+}
+
+static void reset_nmi(void)
+{
+ nmi_fail = 0;
+}
+
+static void dotest(void (*testcase_fn)(void), int expected)
+{
+ testcase_fn();
+ /*
+ * Filter out expected failures:
+ */
+ if (nmi_fail != expected) {
+ unexpected_testcase_failures++;
+
+ if (nmi_fail == FAILURE)
+ printk("FAILED |");
+ else if (nmi_fail == TIMEOUT)
+ printk("TIMEOUT|");
+ else
+ printk("ERROR |");
+ dump_stack();
+ } else {
+ testcase_successes++;
+ printk(" ok |");
+ }
+ testcase_total++;
+
+ reset_nmi();
+}
+
+static inline void print_testname(const char *testname)
+{
+ printk("%12s:", testname);
+}
+
+void nmi_selftest(void)
+{
+ init_nmi_testsuite();
+
+ /*
+ * Run the testsuite:
+ */
+ printk("----------------\n");
+ printk("| NMI testsuite:\n");
+ printk("--------------------\n");
+
+ print_testname("remote IPI");
+ dotest(remote_ipi, SUCCESS);
+ printk("\n");
+ print_testname("local IPI");
+ dotest(local_ipi, SUCCESS);
+ printk("\n");
+
+ cleanup_nmi_testsuite();
+
+ if (unexpected_testcase_failures) {
+ printk("--------------------\n");
+ printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
+ unexpected_testcase_failures, testcase_total);
+ printk("-----------------------------------------------------------------\n");
+ } else if (expected_testcase_failures && testcase_successes) {
+ printk("--------------------\n");
+ printk("%3d out of %3d testcases failed, as expected. |\n",
+ expected_testcase_failures, testcase_total);
+ printk("----------------------------------------------------\n");
+ } else if (expected_testcase_failures && !testcase_successes) {
+ printk("--------------------\n");
+ printk("All %3d testcases failed, as expected. |\n",
+ expected_testcase_failures);
+ printk("----------------------------------------\n");
+ } else {
+ printk("--------------------\n");
+ printk("Good, all %3d testcases passed! |\n",
+ testcase_successes);
+ printk("---------------------------------\n");
+ }
+}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 80dc793b3f63..1c4d769e21ea 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -45,6 +45,15 @@ int iommu_detected __read_mostly = 0;
*/
int iommu_pass_through __read_mostly;
+/*
+ * Group multi-function PCI devices into a single device-group for the
+ * iommu_device_group interface. This tells the iommu driver to pretend
+ * it cannot distinguish between functions of a device, exposing only one
+ * group for the device. Useful for disallowing use of individual PCI
+ * functions from userspace drivers.
+ */
+int iommu_group_mf __read_mostly;
+
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
/* Dummy device used for NULL arguments (normally ISA). */
@@ -169,6 +178,8 @@ static __init int iommu_setup(char *p)
#endif
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
+ if (!strncmp(p, "group_mf", 8))
+ iommu_group_mf = 1;
gart_parse_options(p);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b9b3b1a51643..15763af7bfe3 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
regs.orig_ax = -1;
regs.ip = (unsigned long) kernel_thread_helper;
regs.cs = __KERNEL_CS | get_kernel_rpl();
- regs.flags = X86_EFLAGS_IF | 0x2;
+ regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
/* Ok, create the new process.. */
return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
@@ -403,6 +403,14 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
+bool set_pm_idle_to_default(void)
+{
+ bool ret = !!pm_idle;
+
+ pm_idle = default_idle;
+
+ return ret;
+}
void stop_this_cpu(void *dummy)
{
local_irq_disable();
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..485204f58cda 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
+ rcu_idle_enter();
while (!need_resched()) {
check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
pm_idle();
start_critical_timings();
}
- tick_nohz_restart_sched_tick();
+ rcu_idle_exit();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..9b9fe4a85c87 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
/* endless idle loop with no priority at all */
while (1) {
- tick_nohz_stop_sched_tick(1);
+ tick_nohz_idle_enter();
while (!need_resched()) {
rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
enter_idle();
/* Don't trace irqs off for idle */
stop_critical_timings();
+
+ /* enter_idle() needs rcu for notifiers */
+ rcu_idle_enter();
+
if (cpuidle_idle_call())
pm_idle();
+
+ rcu_idle_exit();
start_critical_timings();
/* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
__exit_idle();
}
- tick_nohz_restart_sched_tick();
+ tick_nohz_idle_exit();
preempt_enable_no_resched();
schedule();
preempt_disable();
@@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
- p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+ p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
+ IO_BITMAP_BYTES, GFP_KERNEL);
if (!p->thread.io_bitmap_ptr) {
p->thread.io_bitmap_max = 0;
return -ENOMEM;
}
- memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
- IO_BITMAP_BYTES);
set_tsk_thread_flag(p, TIF_IO_BITMAP);
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 82528799c5de..89a04c7b5bb6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -749,7 +749,8 @@ put:
/*
* Handle PTRACE_POKEUSR calls for the debug register area.
*/
-int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
+static int ptrace_set_debugreg(struct task_struct *tsk, int n,
+ unsigned long val)
{
struct thread_struct *thread = &(tsk->thread);
int rc = 0;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index b78643d0f9a5..03920a15a632 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
quirk_amd_nb_node);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
+ quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
+ quirk_amd_nb_node);
+
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e334be1182b9..37a458b521a6 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -124,7 +124,7 @@ __setup("reboot=", reboot_setup);
*/
/*
- * Some machines require the "reboot=b" commandline option,
+ * Some machines require the "reboot=b" or "reboot=k" commandline options,
* this quirk makes that automatic.
*/
static int __init set_bios_reboot(const struct dmi_system_id *d)
@@ -136,6 +136,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
return 0;
}
+static int __init set_kbd_reboot(const struct dmi_system_id *d)
+{
+ if (reboot_type != BOOT_KBD) {
+ reboot_type = BOOT_KBD;
+ printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
+ }
+ return 0;
+}
+
static struct dmi_system_id __initdata reboot_dmi_table[] = {
{ /* Handle problems with rebooting on Dell E520's */
.callback = set_bios_reboot,
@@ -295,7 +304,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
},
},
{ /* Handle reboot issue on Acer Aspire one */
- .callback = set_bios_reboot,
+ .callback = set_kbd_reboot,
.ident = "Acer Aspire One A110",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
@@ -443,6 +452,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
},
},
+ { /* Handle problems with rebooting on the OptiPlex 990. */
+ .callback = set_pci_reboot,
+ .ident = "Dell OptiPlex 990",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
+ },
+ },
{ }
};
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 348ce016a835..af6db6ec5b2a 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -12,6 +12,7 @@
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
#include <asm/time.h>
+#include <asm/mrst.h>
#ifdef CONFIG_X86_32
/*
@@ -242,6 +243,10 @@ static __init int add_rtc_cmos(void)
if (of_have_populated_dt())
return 0;
+ /* Intel MID platforms don't have ioport rtc */
+ if (mrst_identify_cpu())
+ return -ENODEV;
+
platform_device_register(&rtc_device);
dev_info(&rtc_device.dev,
"registered platform RTC device (no PNP device found)\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index afaf38447ef5..d7d5099fe874 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -306,7 +306,8 @@ static void __init cleanup_highmap(void)
static void __init reserve_brk(void)
{
if (_brk_end > _brk_start)
- memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
+ memblock_reserve(__pa(_brk_start),
+ __pa(_brk_end) - __pa(_brk_start));
/* Mark brk area as locked down and no longer taking any
new allocations */
@@ -331,13 +332,13 @@ static void __init relocate_initrd(void)
ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
PAGE_SIZE);
- if (ramdisk_here == MEMBLOCK_ERROR)
+ if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
ramdisk_size);
/* Note: this includes all the lowmem currently occupied by
the initrd, we rely on that fact to keep the data intact. */
- memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
+ memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -393,7 +394,7 @@ static void __init reserve_initrd(void)
initrd_start = 0;
if (ramdisk_size >= (end_of_lowmem>>1)) {
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
printk(KERN_ERR "initrd too large to handle, "
"disabling initrd\n");
return;
@@ -416,7 +417,7 @@ static void __init reserve_initrd(void)
relocate_initrd();
- memblock_x86_free_range(ramdisk_image, ramdisk_end);
+ memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
}
#else
static void __init reserve_initrd(void)
@@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void)
{
struct setup_data *data;
u64 pa_data;
- char buf[32];
if (boot_params.hdr.version < 0x0209)
return;
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
data = early_memremap(pa_data, sizeof(*data));
- sprintf(buf, "setup data %x", data->type);
- memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ memblock_reserve(pa_data, sizeof(*data) + data->len);
pa_data = data->next;
early_iounmap(data, sizeof(*data));
}
@@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void)
crash_base = memblock_find_in_range(alignment,
CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
- if (crash_base == MEMBLOCK_ERROR) {
+ if (!crash_base) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
@@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void)
return;
}
}
- memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
+ memblock_reserve(crash_base, crash_size);
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
@@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void)
addr = find_ibft_region(&size);
if (size)
- memblock_x86_reserve_range(addr, addr + size, "* ibft");
+ memblock_reserve(addr, size);
}
static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
@@ -750,12 +749,7 @@ void __init setup_arch(char **cmdline_p)
#endif
#ifdef CONFIG_EFI
if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
-#ifdef CONFIG_X86_32
- "EL32",
-#else
- "EL64",
-#endif
- 4)) {
+ EFI_LOADER_SIGNATURE, 4)) {
efi_enabled = 1;
efi_memblock_x86_reserve_range();
}
@@ -1045,6 +1039,8 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
+ x86_platform.wallclock_init();
+
mcheck_init();
arch_init_ideal_nops();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 54ddaeb221c1..46a01bdc27e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -682,7 +682,6 @@ static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
struct pt_regs *regs)
{
- sigset_t blocked;
int ret;
/* Are we from a system call? */
@@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
regs->flags &= ~X86_EFLAGS_TF;
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
- set_current_blocked(&blocked);
+ block_sigmask(ka, sig);
tracehook_signal_handler(sig, info, ka, regs,
test_thread_flag(TIF_SINGLESTEP));
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 16204dc15484..66c74f481cab 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -29,6 +29,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
*
@@ -148,6 +149,60 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
+static atomic_t stopping_cpu = ATOMIC_INIT(-1);
+
+static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
+{
+ /* We are registered on stopping cpu too, avoid spurious NMI */
+ if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
+ return NMI_HANDLED;
+
+ stop_this_cpu(NULL);
+
+ return NMI_HANDLED;
+}
+
+static void native_nmi_stop_other_cpus(int wait)
+{
+ unsigned long flags;
+ unsigned long timeout;
+
+ if (reboot_force)
+ return;
+
+ /*
+ * Use an own vector here because smp_call_function
+ * does lots of things not suitable in a panic situation.
+ */
+ if (num_online_cpus() > 1) {
+ /* did someone beat us here? */
+ if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+ return;
+
+ if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
+ NMI_FLAG_FIRST, "smp_stop"))
+ /* Note: we ignore failures here */
+ return;
+
+ /* sync above data before sending NMI */
+ wmb();
+
+ apic->send_IPI_allbutself(NMI_VECTOR);
+
+ /*
+ * Don't wait longer than a second if the caller
+ * didn't ask us to wait.
+ */
+ timeout = USEC_PER_SEC;
+ while (num_online_cpus() > 1 && (wait || timeout--))
+ udelay(1);
+ }
+
+ local_irq_save(flags);
+ disable_local_APIC();
+ local_irq_restore(flags);
+}
+
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
@@ -160,7 +215,7 @@ asmlinkage void smp_reboot_interrupt(void)
irq_exit();
}
-static void native_stop_other_cpus(int wait)
+static void native_irq_stop_other_cpus(int wait)
{
unsigned long flags;
unsigned long timeout;
@@ -194,6 +249,11 @@ static void native_stop_other_cpus(int wait)
local_irq_restore(flags);
}
+static void native_smp_disable_nmi_ipi(void)
+{
+ smp_ops.stop_other_cpus = native_irq_stop_other_cpus;
+}
+
/*
* Reschedule call back.
*/
@@ -225,12 +285,20 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
irq_exit();
}
+static int __init nonmi_ipi_setup(char *str)
+{
+ native_smp_disable_nmi_ipi();
+ return 1;
+}
+
+__setup("nonmi_ipi", nonmi_ipi_setup);
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
.smp_cpus_done = native_smp_cpus_done,
- .stop_other_cpus = native_stop_other_cpus,
+ .stop_other_cpus = native_nmi_stop_other_cpus,
.smp_send_reschedule = native_smp_send_reschedule,
.cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9f548cb4a958..66d250c00d11 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -207,23 +207,29 @@ static void __cpuinit smp_callin(void)
* Need to setup vector mappings before we enable interrupts.
*/
setup_vector_irq(smp_processor_id());
+
+ /*
+ * Save our processor parameters. Note: this information
+ * is needed for clock calibration.
+ */
+ smp_store_cpu_info(cpuid);
+
/*
* Get our bogomips.
+ * Update loops_per_jiffy in cpu_data. Previous call to
+ * smp_store_cpu_info() stored a value that is close but not as
+ * accurate as the value just calculated.
*
* Need to enable IRQs because it can take longer and then
* the NMI watchdog might kill us.
*/
local_irq_enable();
calibrate_delay();
+ cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy;
local_irq_disable();
pr_debug("Stack at about %p\n", &cpuid);
/*
- * Save our processor parameters
- */
- smp_store_cpu_info(cpuid);
-
- /*
* This must be done before setting cpu_online_mask
* or calling notify_cpu_starting.
*/
@@ -840,7 +846,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
- !physid_isset(apicid, phys_cpu_present_map)) {
+ !physid_isset(apicid, phys_cpu_present_map) ||
+ (!x2apic_mode && apicid >= 255)) {
printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
return -EINVAL;
}
@@ -1142,6 +1149,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
{
pr_debug("Boot done.\n");
+ nmi_selftest();
impress_friends();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a91ae7709b49..a73b61055ad6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -14,11 +14,11 @@ void __init setup_trampolines(void)
/* Has to be in very low memory so we can execute real-mode AP code. */
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
- if (mem == MEMBLOCK_ERROR)
+ if (!mem)
panic("Cannot allocate trampoline\n");
x86_trampoline_base = __va(mem);
- memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
+ memblock_reserve(mem, size);
printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
x86_trampoline_base, (unsigned long long)mem, size);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a8e3eb83466c..482ec3af2067 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -306,19 +306,20 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
== NOTIFY_STOP)
return;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
-#ifdef CONFIG_KPROBES
+
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
-#else
- if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
- == NOTIFY_STOP)
- return;
-#endif
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
preempt_conditional_sti(regs);
do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
}
#ifdef CONFIG_X86_64
@@ -411,6 +412,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
SIGTRAP) == NOTIFY_STOP)
return;
+ /*
+ * Let others (NMI) know that the debug stack is in use
+ * as we may switch to the interrupt stack.
+ */
+ debug_stack_usage_inc();
+
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
@@ -418,6 +425,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
handle_vm86_trap((struct kernel_vm86_regs *) regs,
error_code, 1);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -437,6 +445,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
+ debug_stack_usage_dec();
return;
}
@@ -723,4 +732,10 @@ void __init trap_init(void)
cpu_init();
x86_init.irqs.trap_init();
+
+#ifdef CONFIG_X86_64
+ memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
+ set_nmi_gate(1, &debug);
+ set_nmi_gate(3, &int3);
+#endif
}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index db483369f10b..c0dd5b603749 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable;
erroneous rdtsc usage on !cpu_has_tsc processors */
static int __read_mostly tsc_disabled = -1;
-static int tsc_clocksource_reliable;
+int tsc_clocksource_reliable;
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
}
#define CAL_MS 10
-#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS))
#define CAL_PIT_LOOPS 1000
#define CAL2_MS 50
-#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS))
#define CAL2_PIT_LOOPS 5000
@@ -995,3 +995,23 @@ void __init tsc_init(void)
check_system_tsc_reliable();
}
+#ifdef CONFIG_SMP
+/*
+ * If we have a constant TSC and are using the TSC for the delay loop,
+ * we can skip clock calibration if another cpu in the same socket has already
+ * been calibrated. This assumes that CONSTANT_TSC applies to all
+ * cpus in the socket - this should be a safe assumption.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+ int i, cpu = smp_processor_id();
+
+ if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ for_each_online_cpu(i)
+ if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
+ return cpu_data(i).loops_per_jiffy;
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0aa5fed8b9e6..9eba29b46cb7 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
if (unsynchronized_tsc())
return;
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ if (tsc_clocksource_reliable) {
if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
pr_info(
"Skipped synchronization checks as TSC is reliable.\n");
@@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
{
int cpus = 2;
- if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
+ if (unsynchronized_tsc() || tsc_clocksource_reliable)
return;
/*
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index e4d4a22e8b94..b07ba9393564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
};
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
static int __init vsyscall_setup(char *str)
{
@@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr)
return nr;
}
+static bool write_ok_or_segv(unsigned long ptr, size_t size)
+{
+ /*
+ * XXX: if access_ok, get_user, and put_user handled
+ * sig_on_uaccess_error, this could go away.
+ */
+
+ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
+ siginfo_t info;
+ struct thread_struct *thread = &current->thread;
+
+ thread->error_code = 6; /* user fault, no page, write */
+ thread->cr2 = ptr;
+ thread->trap_no = 14;
+
+ memset(&info, 0, sizeof(info));
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = (void __user *)ptr;
+
+ force_sig_info(SIGSEGV, &info, current);
+ return false;
+ } else {
+ return true;
+ }
+}
+
bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
int vsyscall_nr;
+ int prev_sig_on_uaccess_error;
long ret;
/*
@@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
if (seccomp_mode(&tsk->seccomp))
do_exit(SIGKILL);
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+ current_thread_info()->sig_on_uaccess_error = 1;
+
+ /*
+ * 0 is a valid user pointer (in the access_ok sense) on 32-bit and
+ * 64-bit, so we don't need to special-case it here. For all the
+ * vsyscalls, 0 means "don't write anything" not "write it at
+ * address 0".
+ */
+ ret = -EFAULT;
switch (vsyscall_nr) {
case 0:
+ if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
+ !write_ok_or_segv(regs->si, sizeof(struct timezone)))
+ break;
+
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t)))
+ break;
+
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned)))
+ break;
+
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
0);
break;
}
+ current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
+
if (ret == -EFAULT) {
- /*
- * Bad news -- userspace fed a bad pointer to a vsyscall.
- *
- * With a real vsyscall, that would have caused SIGSEGV.
- * To make writing reliable exploits using the emulated
- * vsyscalls harder, generate SIGSEGV here as well.
- */
+ /* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall fault (exploit attempt?)");
- goto sigsegv;
+
+ /*
+ * If we failed to generate a signal for any reason,
+ * generate one here. (This should be impossible.)
+ */
+ if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) &&
+ !sigismember(&tsk->pending.signal, SIGSEGV)))
+ goto sigsegv;
+
+ return true; /* Don't emulate the ret. */
}
regs->ax = ret;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 6f164bd5e14d..947a06ccc673 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -21,12 +21,14 @@
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
+#include <asm/mach_traps.h>
void __cpuinit x86_init_noop(void) { }
void __init x86_init_uint_noop(unsigned int unused) { }
void __init x86_init_pgd_noop(pgd_t *unused) { }
int __init iommu_init_noop(void) { return 0; }
void iommu_shutdown_noop(void) { }
+void wallclock_init_noop(void) { }
/*
* The platform setup functions are preset with the default functions
@@ -90,6 +92,7 @@ struct x86_init_ops x86_init __initdata = {
struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
.setup_percpu_clockev = setup_secondary_APIC_clock,
+ .fixup_cpu_id = x86_default_fixup_cpu_id,
};
static void default_nmi_init(void) { };
@@ -97,11 +100,13 @@ static int default_i8042_detect(void) { return 1; };
struct x86_platform_ops x86_platform = {
.calibrate_tsc = native_calibrate_tsc,
+ .wallclock_init = wallclock_init_noop,
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
+ .get_nmi_reason = default_get_nmi_reason,
.i8042_detect = default_i8042_detect
};
@@ -110,4 +115,5 @@ struct x86_msi_ops x86_msi = {
.setup_msi_irqs = native_setup_msi_irqs,
.teardown_msi_irq = native_teardown_msi_irq,
.teardown_msi_irqs = default_teardown_msi_irqs,
+ .restore_msi_irqs = default_restore_msi_irqs,
};
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ff5790d8e990..1a7fe868f375 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -35,6 +35,7 @@ config KVM
select KVM_MMIO
select TASKSTATS
select TASK_DELAY_ACCT
+ select PERF_EVENTS
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
@@ -52,6 +53,8 @@ config KVM
config KVM_INTEL
tristate "KVM for Intel processors support"
depends on KVM
+ # for perf_guest_get_msrs():
+ depends on CPU_SUP_INTEL
---help---
Provides support for KVM on Intel processors equipped with the VT
extensions.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f431c8..4f579e8dcacf 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,7 +12,7 @@ kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o
+ i8254.o timer.o cpuid.o pmu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
new file mode 100644
index 000000000000..89b02bfaaca5
--- /dev/null
+++ b/arch/x86/kvm/cpuid.c
@@ -0,0 +1,670 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ * cpuid support routines
+ *
+ * derived from arch/x86/kvm/x86.c
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright IBM Corporation, 2008
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
+#include <asm/user.h>
+#include <asm/xsave.h>
+#include "cpuid.h"
+#include "lapic.h"
+#include "mmu.h"
+#include "trace.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ if (!best)
+ return;
+
+ /* Update OSXSAVE bit */
+ if (cpu_has_xsave && best->function == 0x1) {
+ best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+ best->ecx |= bit(X86_FEATURE_OSXSAVE);
+ }
+
+ if (apic) {
+ if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+ apic->lapic_timer.timer_mode_mask = 3 << 17;
+ else
+ apic->lapic_timer.timer_mode_mask = 1 << 17;
+ }
+
+ kvm_pmu_cpuid_update(vcpu);
+}
+
+static int is_efer_nx(void)
+{
+ unsigned long long efer = 0;
+
+ rdmsrl_safe(MSR_EFER, &efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
+ entry = NULL;
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ e = &vcpu->arch.cpuid_entries[i];
+ if (e->function == 0x80000001) {
+ entry = e;
+ break;
+ }
+ }
+ if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
+ entry->edx &= ~(1 << 20);
+ printk(KERN_INFO "kvm: guest NX capability removed\n");
+ }
+}
+
+/* when an old userspace process fills a new kernel module */
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out_free;
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->arch.cpuid_entries[i].index = 0;
+ vcpu->arch.cpuid_entries[i].flags = 0;
+ vcpu->arch.cpuid_entries[i].padding[0] = 0;
+ vcpu->arch.cpuid_entries[i].padding[1] = 0;
+ vcpu->arch.cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ r = 0;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ kvm_update_cpuid(vcpu);
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ vcpu->arch.cpuid_nent = cpuid->nent;
+ kvm_apic_set_version(vcpu);
+ kvm_x86_ops->cpuid_update(vcpu);
+ kvm_update_cpuid(vcpu);
+ return 0;
+
+out:
+ return r;
+}
+
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->arch.cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->arch.cpuid_nent;
+ return r;
+}
+
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+static bool supported_xcr0_bit(unsigned bit)
+{
+ u64 mask = ((u64)1 << bit);
+
+ return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
+#define F(x) bit(X86_FEATURE_##x)
+
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ int r;
+ unsigned f_nx = is_efer_nx() ? F(NX) : 0;
+#ifdef CONFIG_X86_64
+ unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
+ ? F(GBPAGES) : 0;
+ unsigned f_lm = F(LM);
+#else
+ unsigned f_gbpages = 0;
+ unsigned f_lm = 0;
+#endif
+ unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
+
+ /* cpuid 1.edx */
+ const u32 kvm_supported_word0_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+ 0 /* Reserved, DS, ACPI */ | F(MMX) |
+ F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+ 0 /* HTT, TM, Reserved, PBE */;
+ /* cpuid 0x80000001.edx */
+ const u32 kvm_supported_word1_x86_features =
+ F(FPU) | F(VME) | F(DE) | F(PSE) |
+ F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+ F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+ F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+ F(PAT) | F(PSE36) | 0 /* Reserved */ |
+ f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+ F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
+ 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+ /* cpuid 1.ecx */
+ const u32 kvm_supported_word4_x86_features =
+ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
+ 0 /* DS-CPL, VMX, SMX, EST */ |
+ 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+ F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ |
+ 0 /* Reserved, DCA */ | F(XMM4_1) |
+ F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
+ 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
+ F(F16C) | F(RDRAND);
+ /* cpuid 0x80000001.ecx */
+ const u32 kvm_supported_word6_x86_features =
+ F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
+ F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+ F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
+ 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
+
+ /* cpuid 0xC0000001.edx */
+ const u32 kvm_supported_word5_x86_features =
+ F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
+ F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
+ F(PMM) | F(PMM_EN);
+
+ /* cpuid 7.0.ebx */
+ const u32 kvm_supported_word9_x86_features =
+ F(FSGSBASE) | F(BMI1) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS);
+
+ /* all calls to cpuid_count() should be made on the same cpu */
+ get_cpu();
+
+ r = -E2BIG;
+
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xd);
+ break;
+ case 1:
+ entry->edx &= kvm_supported_word0_x86_features;
+ cpuid_mask(&entry->edx, 0);
+ entry->ecx &= kvm_supported_word4_x86_features;
+ cpuid_mask(&entry->ecx, 4);
+ /* we support x2apic emulation even if host does not support
+ * it since we emulate x2apic in software */
+ entry->ecx |= F(X2APIC);
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ for (t = 1; t < times; ++t) {
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 has additional index. */
+ case 4: {
+ int i, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ cache_type = entry[i - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 7: {
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* Mask ebx against host capbability word 9 */
+ if (index == 0) {
+ entry->ebx &= kvm_supported_word9_x86_features;
+ cpuid_mask(&entry->ebx, 9);
+ } else
+ entry->ebx = 0;
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
+ case 9:
+ break;
+ case 0xa: { /* Architectural Performance Monitoring */
+ struct x86_pmu_capability cap;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ perf_get_x86_pmu_capability(&cap);
+
+ /*
+ * Only support guest architectural pmu on a host
+ * with architectural pmu.
+ */
+ if (!cap.version)
+ memset(&cap, 0, sizeof(cap));
+
+ eax.split.version_id = min(cap.version, 2);
+ eax.split.num_counters = cap.num_counters_gp;
+ eax.split.bit_width = cap.bit_width_gp;
+ eax.split.mask_length = cap.events_mask_len;
+
+ edx.split.num_counters_fixed = cap.num_counters_fixed;
+ edx.split.bit_width_fixed = cap.bit_width_fixed;
+ edx.split.reserved = 0;
+
+ entry->eax = eax.full;
+ entry->ebx = cap.events_mask;
+ entry->ecx = 0;
+ entry->edx = edx.full;
+ break;
+ }
+ /* function 0xb has additional index. */
+ case 0xb: {
+ int i, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (i = 1; ; ++i) {
+ if (*nent >= maxnent)
+ goto out;
+
+ level_type = entry[i - 1].ecx & 0xff00;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xd: {
+ int idx, i;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ for (idx = 1, i = 1; idx < 64; ++idx) {
+ if (*nent >= maxnent)
+ goto out;
+
+ do_cpuid_1_ent(&entry[i], function, idx);
+ if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
+ continue;
+ entry[i].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ ++i;
+ }
+ break;
+ }
+ case KVM_CPUID_SIGNATURE: {
+ char signature[12] = "KVMKVMKVM\0\0";
+ u32 *sigptr = (u32 *)signature;
+ entry->eax = 0;
+ entry->ebx = sigptr[0];
+ entry->ecx = sigptr[1];
+ entry->edx = sigptr[2];
+ break;
+ }
+ case KVM_CPUID_FEATURES:
+ entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+ (1 << KVM_FEATURE_NOP_IO_DELAY) |
+ (1 << KVM_FEATURE_CLOCKSOURCE2) |
+ (1 << KVM_FEATURE_ASYNC_PF) |
+ (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+ if (sched_info_on())
+ entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001a);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_supported_word1_x86_features;
+ cpuid_mask(&entry->edx, 1);
+ entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
+ break;
+ case 0x80000008: {
+ unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+ unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+ unsigned phys_as = entry->eax & 0xff;
+
+ if (!g_phys_as)
+ g_phys_as = phys_as;
+ entry->eax = g_phys_as | (virt_as << 8);
+ entry->ebx = entry->edx = 0;
+ break;
+ }
+ case 0x80000019:
+ entry->ecx = entry->edx = 0;
+ break;
+ case 0x8000001a:
+ break;
+ case 0x8000001d:
+ break;
+ /*Add support for Centaur's CPUID instruction*/
+ case 0xC0000000:
+ /*Just support up to 0xC0000004 now*/
+ entry->eax = min(entry->eax, 0xC0000004);
+ break;
+ case 0xC0000001:
+ entry->edx &= kvm_supported_word5_x86_features;
+ cpuid_mask(&entry->edx, 5);
+ break;
+ case 3: /* Processor serial number */
+ case 5: /* MONITOR/MWAIT */
+ case 6: /* Thermal management */
+ case 0x80000007: /* Advanced power management */
+ case 0xC0000002:
+ case 0xC0000003:
+ case 0xC0000004:
+ default:
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ kvm_x86_ops->set_supported_cpuid(function, entry);
+
+ r = 0;
+
+out:
+ put_cpu();
+
+ return r;
+}
+
+#undef F
+
+struct kvm_cpuid_param {
+ u32 func;
+ u32 idx;
+ bool has_leaf_count;
+ bool (*qualifier)(struct kvm_cpuid_param *param);
+};
+
+static bool is_centaur_cpu(struct kvm_cpuid_param *param)
+{
+ return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
+}
+
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG, i;
+ u32 func;
+ static struct kvm_cpuid_param param[] = {
+ { .func = 0, .has_leaf_count = true },
+ { .func = 0x80000000, .has_leaf_count = true },
+ { .func = 0xC0000000, .qualifier = is_centaur_cpu, .has_leaf_count = true },
+ { .func = KVM_CPUID_SIGNATURE },
+ { .func = KVM_CPUID_FEATURES },
+ };
+
+ if (cpuid->nent < 1)
+ goto out;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+
+ r = 0;
+ for (i = 0; i < ARRAY_SIZE(param); i++) {
+ struct kvm_cpuid_param *ent = &param[i];
+
+ if (ent->qualifier && !ent->qualifier(ent))
+ continue;
+
+ r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
+ &nent, cpuid->nent);
+
+ if (r)
+ goto out_free;
+
+ if (!ent->has_leaf_count)
+ continue;
+
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
+ r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
+ &nent, cpuid->nent);
+
+ if (r)
+ goto out_free;
+ }
+
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
+ int j, nent = vcpu->arch.cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; ; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return j;
+ }
+ }
+ return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ int i;
+ struct kvm_cpuid_entry2 *best = NULL;
+
+ for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
+ struct kvm_cpuid_entry2 *e;
+
+ e = &vcpu->arch.cpuid_entries[i];
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
+ best = e;
+ break;
+ }
+ }
+ return best;
+}
+EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
+
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+ if (!best || best->eax < 0x80000008)
+ goto not_found;
+ best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+ if (best)
+ return best->eax & 0xff;
+not_found:
+ return 36;
+}
+
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *maxlevel;
+
+ maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ if (!maxlevel || maxlevel->eax >= function)
+ return NULL;
+ if (function & 0x80000000) {
+ maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (!maxlevel)
+ return NULL;
+ }
+ return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
+void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+{
+ u32 function, index;
+ struct kvm_cpuid_entry2 *best;
+
+ function = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
+ best = kvm_find_cpuid_entry(vcpu, function, index);
+
+ if (!best)
+ best = check_cpuid_limit(vcpu, function, index);
+
+ if (best) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
+ kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
+ kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
+ }
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ trace_kvm_cpuid(function,
+ kvm_register_read(vcpu, VCPU_REGS_RAX),
+ kvm_register_read(vcpu, VCPU_REGS_RBX),
+ kvm_register_read(vcpu, VCPU_REGS_RCX),
+ kvm_register_read(vcpu, VCPU_REGS_RDX));
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
new file mode 100644
index 000000000000..5b97e1797a6d
--- /dev/null
+++ b/arch/x86/kvm/cpuid.h
@@ -0,0 +1,46 @@
+#ifndef ARCH_X86_KVM_CPUID_H
+#define ARCH_X86_KVM_CPUID_H
+
+#include "x86.h"
+
+void kvm_update_cpuid(struct kvm_vcpu *vcpu);
+struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
+ u32 function, u32 index);
+int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid *cpuid,
+ struct kvm_cpuid_entry __user *entries);
+int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries);
+
+
+static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
+#endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f1e3be18a08f..05a562b85025 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -125,8 +125,9 @@
#define Lock (1<<26) /* lock prefix is allowed for the instruction */
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
+#define PageTable (1 << 29) /* instruction used to write page table */
/* Source 2 operand type */
-#define Src2Shift (29)
+#define Src2Shift (30)
#define Src2None (OpNone << Src2Shift)
#define Src2CL (OpCL << Src2Shift)
#define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1674,11 +1675,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static int em_grp1a(struct x86_emulate_ctxt *ctxt)
-{
- return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
-}
-
static int em_grp2(struct x86_emulate_ctxt *ctxt)
{
switch (ctxt->modrm_reg) {
@@ -1788,7 +1784,7 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int em_grp9(struct x86_emulate_ctxt *ctxt)
+static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
{
u64 old = ctxt->dst.orig_val64;
@@ -1831,6 +1827,24 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
+{
+ /* Save real source value, then compare EAX against destination. */
+ ctxt->src.orig_val = ctxt->src.val;
+ ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+ emulate_2op_SrcV(ctxt, "cmp");
+
+ if (ctxt->eflags & EFLG_ZF) {
+ /* Success: write back to memory. */
+ ctxt->dst.val = ctxt->src.orig_val;
+ } else {
+ /* Failure: write the value we saw to EAX. */
+ ctxt->dst.type = OP_REG;
+ ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
+ }
+ return X86EMUL_CONTINUE;
+}
+
static int em_lseg(struct x86_emulate_ctxt *ctxt)
{
int seg = ctxt->src2.val;
@@ -2481,6 +2495,15 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_call(struct x86_emulate_ctxt *ctxt)
+{
+ long rel = ctxt->src.val;
+
+ ctxt->src.val = (unsigned long)ctxt->_eip;
+ jmp_rel(ctxt, rel);
+ return em_push(ctxt);
+}
+
static int em_call_far(struct x86_emulate_ctxt *ctxt)
{
u16 sel, old_cs;
@@ -2622,12 +2645,75 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_rdpmc(struct x86_emulate_ctxt *ctxt)
+{
+ u64 pmc;
+
+ if (ctxt->ops->read_pmc(ctxt, ctxt->regs[VCPU_REGS_RCX], &pmc))
+ return emulate_gp(ctxt, 0);
+ ctxt->regs[VCPU_REGS_RAX] = (u32)pmc;
+ ctxt->regs[VCPU_REGS_RDX] = pmc >> 32;
+ return X86EMUL_CONTINUE;
+}
+
static int em_mov(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = ctxt->src.val;
return X86EMUL_CONTINUE;
}
+static int em_cr_write(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_dr_write(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long val;
+
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ val = ctxt->src.val & ~0ULL;
+ else
+ val = ctxt->src.val & ~0U;
+
+ /* #UD condition is already handled. */
+ if (ctxt->ops->set_dr(ctxt, ctxt->modrm_reg, val) < 0)
+ return emulate_gp(ctxt, 0);
+
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+ | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+ if (ctxt->ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data))
+ return emulate_gp(ctxt, 0);
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
+{
+ u64 msr_data;
+
+ if (ctxt->ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data))
+ return emulate_gp(ctxt, 0);
+
+ ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+ ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
+ return X86EMUL_CONTINUE;
+}
+
static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
{
if (ctxt->modrm_reg > VCPU_SREG_GS)
@@ -2775,6 +2861,24 @@ static int em_jcxz(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_in(struct x86_emulate_ctxt *ctxt)
+{
+ if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+ &ctxt->dst.val))
+ return X86EMUL_IO_NEEDED;
+
+ return X86EMUL_CONTINUE;
+}
+
+static int em_out(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+ &ctxt->src.val, 1);
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ return X86EMUL_CONTINUE;
+}
+
static int em_cli(struct x86_emulate_ctxt *ctxt)
{
if (emulator_bad_iopl(ctxt))
@@ -2794,6 +2898,69 @@ static int em_sti(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_bt(struct x86_emulate_ctxt *ctxt)
+{
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ /* only subword offset */
+ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+
+ emulate_2op_SrcV_nobyte(ctxt, "bt");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bts(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV_nobyte(ctxt, "bts");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_btr(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV_nobyte(ctxt, "btr");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_btc(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_2op_SrcV_nobyte(ctxt, "btc");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bsf(struct x86_emulate_ctxt *ctxt)
+{
+ u8 zf;
+
+ __asm__ ("bsf %2, %0; setz %1"
+ : "=r"(ctxt->dst.val), "=q"(zf)
+ : "r"(ctxt->src.val));
+
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ }
+ return X86EMUL_CONTINUE;
+}
+
+static int em_bsr(struct x86_emulate_ctxt *ctxt)
+{
+ u8 zf;
+
+ __asm__ ("bsr %2, %0; setz %1"
+ : "=r"(ctxt->dst.val), "=q"(zf)
+ : "r"(ctxt->src.val));
+
+ ctxt->eflags &= ~X86_EFLAGS_ZF;
+ if (zf) {
+ ctxt->eflags |= X86_EFLAGS_ZF;
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
+ }
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -2867,9 +3034,6 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
break;
}
case 4: {
- u64 cr4;
-
- cr4 = ctxt->ops->get_cr(ctxt, 4);
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
@@ -3003,6 +3167,8 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define D2bv(_f) D((_f) | ByteOp), D(_f)
#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
+#define I2bvIP(_f, _e, _i, _p) \
+ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p)
#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
@@ -3033,17 +3199,17 @@ static struct opcode group7_rm7[] = {
static struct opcode group1[] = {
I(Lock, em_add),
- I(Lock, em_or),
+ I(Lock | PageTable, em_or),
I(Lock, em_adc),
I(Lock, em_sbb),
- I(Lock, em_and),
+ I(Lock | PageTable, em_and),
I(Lock, em_sub),
I(Lock, em_xor),
I(0, em_cmp),
};
static struct opcode group1A[] = {
- D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | ModRM | Mov | Stack, em_pop), N, N, N, N, N, N, N,
};
static struct opcode group3[] = {
@@ -3058,16 +3224,19 @@ static struct opcode group3[] = {
};
static struct opcode group4[] = {
- D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
+ I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
+ I(ByteOp | DstMem | SrcNone | ModRM | Lock, em_grp45),
N, N, N, N, N, N,
};
static struct opcode group5[] = {
- D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
- D(SrcMem | ModRM | Stack),
+ I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+ I(DstMem | SrcNone | ModRM | Lock, em_grp45),
+ I(SrcMem | ModRM | Stack, em_grp45),
I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
- D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
- D(SrcMem | ModRM | Stack), N,
+ I(SrcMem | ModRM | Stack, em_grp45),
+ I(SrcMemFAddr | ModRM | ImplicitOps, em_grp45),
+ I(SrcMem | ModRM | Stack, em_grp45), N,
};
static struct opcode group6[] = {
@@ -3096,18 +3265,21 @@ static struct group_dual group7 = { {
static struct opcode group8[] = {
N, N, N, N,
- D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
- D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
+ I(DstMem | SrcImmByte | ModRM, em_bt),
+ I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_bts),
+ I(DstMem | SrcImmByte | ModRM | Lock, em_btr),
+ I(DstMem | SrcImmByte | ModRM | Lock | PageTable, em_btc),
};
static struct group_dual group9 = { {
- N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
+ N, I(DstMem64 | ModRM | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
}, {
N, N, N, N, N, N, N, N,
} };
static struct opcode group11[] = {
- I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
+ I(DstMem | SrcImm | ModRM | Mov | PageTable, em_mov),
+ X7(D(Undefined)),
};
static struct gprefix pfx_0f_6f_0f_7f = {
@@ -3120,7 +3292,7 @@ static struct opcode opcode_table[256] = {
I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
/* 0x08 - 0x0F */
- I6ALU(Lock, em_or),
+ I6ALU(Lock | PageTable, em_or),
I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
N,
/* 0x10 - 0x17 */
@@ -3132,7 +3304,7 @@ static struct opcode opcode_table[256] = {
I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
/* 0x20 - 0x27 */
- I6ALU(Lock, em_and), N, N,
+ I6ALU(Lock | PageTable, em_and), N, N,
/* 0x28 - 0x2F */
I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
/* 0x30 - 0x37 */
@@ -3155,8 +3327,8 @@ static struct opcode opcode_table[256] = {
I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
I(SrcImmByte | Mov | Stack, em_push),
I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
- D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
- D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
+ I2bvIP(DstDI | SrcDX | Mov | String, em_in, ins, check_perm_in), /* insb, insw/insd */
+ I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
X16(D(SrcImmByte)),
/* 0x80 - 0x87 */
@@ -3165,11 +3337,11 @@ static struct opcode opcode_table[256] = {
G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
G(DstMem | SrcImmByte | ModRM | Group, group1),
I2bv(DstMem | SrcReg | ModRM, em_test),
- I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg),
/* 0x88 - 0x8F */
- I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
+ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov),
I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
- I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+ I(DstMem | SrcNone | ModRM | Mov | PageTable, em_mov_rm_sreg),
D(ModRM | SrcMem | NoAccess | DstReg),
I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
G(0, group1A),
@@ -3182,7 +3354,7 @@ static struct opcode opcode_table[256] = {
II(ImplicitOps | Stack, em_popf, popf), N, N,
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
- I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
+ I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstDI | String, em_cmp),
/* 0xA8 - 0xAF */
@@ -3213,13 +3385,13 @@ static struct opcode opcode_table[256] = {
/* 0xE0 - 0xE7 */
X3(I(SrcImmByte, em_loop)),
I(SrcImmByte, em_jcxz),
- D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
- D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
+ I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- D(SrcImm | Stack), D(SrcImm | ImplicitOps),
+ I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
- D2bvIP(SrcDX | DstAcc, in, check_perm_in),
- D2bvIP(SrcAcc | DstDX, out, check_perm_out),
+ I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
+ I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
N, DI(ImplicitOps, icebp), N, N,
DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3242,15 +3414,15 @@ static struct opcode twobyte_table[256] = {
/* 0x20 - 0x2F */
DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
- DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
- DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
+ IIP(ModRM | SrcMem | Priv | Op3264, em_cr_write, cr_write, check_cr_write),
+ IIP(ModRM | SrcMem | Priv | Op3264, em_dr_write, dr_write, check_dr_write),
N, N, N, N,
N, N, N, N, N, N, N, N,
/* 0x30 - 0x3F */
- DI(ImplicitOps | Priv, wrmsr),
+ II(ImplicitOps | Priv, em_wrmsr, wrmsr),
IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
- DI(ImplicitOps | Priv, rdmsr),
- DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
+ II(ImplicitOps | Priv, em_rdmsr, rdmsr),
+ IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
I(ImplicitOps | VendorSpecific, em_sysenter),
I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
N, N,
@@ -3275,26 +3447,28 @@ static struct opcode twobyte_table[256] = {
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
- DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
+ DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ DI(ImplicitOps, rsm),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM),
D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
- D2bv(DstMem | SrcReg | ModRM | Lock),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
- D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xB8 - 0xBF */
N, N,
- G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
- D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
+ G(BitOp, group8),
+ I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
+ I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr),
D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xCF */
D2bv(DstMem | SrcReg | ModRM | Lock),
@@ -3320,6 +3494,7 @@ static struct opcode twobyte_table[256] = {
#undef D2bv
#undef D2bvIP
#undef I2bv
+#undef I2bvIP
#undef I6ALU
static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
@@ -3697,6 +3872,11 @@ done:
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
+bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->d & PageTable;
+}
+
static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
{
/* The second termination condition only applies for REPE
@@ -3720,7 +3900,6 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
{
struct x86_emulate_ops *ops = ctxt->ops;
- u64 msr_data;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
@@ -3854,15 +4033,6 @@ special_insn:
goto cannot_emulate;
ctxt->dst.val = (s32) ctxt->src.val;
break;
- case 0x6c: /* insb */
- case 0x6d: /* insw/insd */
- ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
- goto do_io_in;
- case 0x6e: /* outsb */
- case 0x6f: /* outsw/outsd */
- ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
- goto do_io_out;
- break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
jmp_rel(ctxt, ctxt->src.val);
@@ -3870,9 +4040,6 @@ special_insn:
case 0x8d: /* lea r16/r32, m */
ctxt->dst.val = ctxt->src.addr.mem.ea;
break;
- case 0x8f: /* pop (sole member of Grp1a) */
- rc = em_grp1a(ctxt);
- break;
case 0x90 ... 0x97: /* nop / xchg reg, rax */
if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
break;
@@ -3905,38 +4072,11 @@ special_insn:
ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
rc = em_grp2(ctxt);
break;
- case 0xe4: /* inb */
- case 0xe5: /* in */
- goto do_io_in;
- case 0xe6: /* outb */
- case 0xe7: /* out */
- goto do_io_out;
- case 0xe8: /* call (near) */ {
- long int rel = ctxt->src.val;
- ctxt->src.val = (unsigned long) ctxt->_eip;
- jmp_rel(ctxt, rel);
- rc = em_push(ctxt);
- break;
- }
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
jmp_rel(ctxt, ctxt->src.val);
ctxt->dst.type = OP_NONE; /* Disable writeback. */
break;
- case 0xec: /* in al,dx */
- case 0xed: /* in (e/r)ax,dx */
- do_io_in:
- if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
- &ctxt->dst.val))
- goto done; /* IO is needed */
- break;
- case 0xee: /* out dx,al */
- case 0xef: /* out dx,(e/r)ax */
- do_io_out:
- ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
- &ctxt->src.val, 1);
- ctxt->dst.type = OP_NONE; /* Disable writeback. */
- break;
case 0xf4: /* hlt */
ctxt->ops->halt(ctxt);
break;
@@ -3956,12 +4096,6 @@ special_insn:
case 0xfd: /* std */
ctxt->eflags |= EFLG_DF;
break;
- case 0xfe: /* Grp4 */
- rc = em_grp45(ctxt);
- break;
- case 0xff: /* Grp5 */
- rc = em_grp45(ctxt);
- break;
default:
goto cannot_emulate;
}
@@ -4036,49 +4170,6 @@ twobyte_insn:
case 0x21: /* mov from dr to reg */
ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
break;
- case 0x22: /* mov reg, cr */
- if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- ctxt->dst.type = OP_NONE;
- break;
- case 0x23: /* mov from reg to dr */
- if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
- ((ctxt->mode == X86EMUL_MODE_PROT64) ?
- ~0ULL : ~0U)) < 0) {
- /* #UD condition is already handled by the code above */
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
-
- ctxt->dst.type = OP_NONE; /* no writeback */
- break;
- case 0x30:
- /* wrmsr */
- msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
- | ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
- if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- }
- rc = X86EMUL_CONTINUE;
- break;
- case 0x32:
- /* rdmsr */
- if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
- emulate_gp(ctxt, 0);
- rc = X86EMUL_PROPAGATE_FAULT;
- goto done;
- } else {
- ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
- ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
- }
- rc = X86EMUL_CONTINUE;
- break;
case 0x40 ... 0x4f: /* cmov */
ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
if (!test_cc(ctxt->b, ctxt->eflags))
@@ -4091,93 +4182,21 @@ twobyte_insn:
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
- case 0xa3:
- bt: /* bt */
- ctxt->dst.type = OP_NONE;
- /* only subword offset */
- ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte(ctxt, "bt");
- break;
case 0xa4: /* shld imm8, r, r/m */
case 0xa5: /* shld cl, r, r/m */
emulate_2op_cl(ctxt, "shld");
break;
- case 0xab:
- bts: /* bts */
- emulate_2op_SrcV_nobyte(ctxt, "bts");
- break;
case 0xac: /* shrd imm8, r, r/m */
case 0xad: /* shrd cl, r, r/m */
emulate_2op_cl(ctxt, "shrd");
break;
case 0xae: /* clflush */
break;
- case 0xb0 ... 0xb1: /* cmpxchg */
- /*
- * Save real source value, then compare EAX against
- * destination.
- */
- ctxt->src.orig_val = ctxt->src.val;
- ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV(ctxt, "cmp");
- if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
- ctxt->dst.val = ctxt->src.orig_val;
- } else {
- /* Failure: write the value we saw to EAX. */
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
- }
- break;
- case 0xb3:
- btr: /* btr */
- emulate_2op_SrcV_nobyte(ctxt, "btr");
- break;
case 0xb6 ... 0xb7: /* movzx */
ctxt->dst.bytes = ctxt->op_bytes;
ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
: (u16) ctxt->src.val;
break;
- case 0xba: /* Grp8 */
- switch (ctxt->modrm_reg & 3) {
- case 0:
- goto bt;
- case 1:
- goto bts;
- case 2:
- goto btr;
- case 3:
- goto btc;
- }
- break;
- case 0xbb:
- btc: /* btc */
- emulate_2op_SrcV_nobyte(ctxt, "btc");
- break;
- case 0xbc: { /* bsf */
- u8 zf;
- __asm__ ("bsf %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- ctxt->dst.type = OP_NONE; /* Disable writeback. */
- }
- break;
- }
- case 0xbd: { /* bsr */
- u8 zf;
- __asm__ ("bsr %2, %0; setz %1"
- : "=r"(ctxt->dst.val), "=q"(zf)
- : "r"(ctxt->src.val));
- ctxt->eflags &= ~X86_EFLAGS_ZF;
- if (zf) {
- ctxt->eflags |= X86_EFLAGS_ZF;
- ctxt->dst.type = OP_NONE; /* Disable writeback. */
- }
- break;
- }
case 0xbe ... 0xbf: /* movsx */
ctxt->dst.bytes = ctxt->op_bytes;
ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
@@ -4194,9 +4213,6 @@ twobyte_insn:
ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
(u64) ctxt->src.val;
break;
- case 0xc7: /* Grp9 (cmpxchg8b) */
- rc = em_grp9(ctxt);
- break;
default:
goto cannot_emulate;
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 76e3f1cd0369..d68f99df690c 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
return HRTIMER_NORESTART;
}
-static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
+static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
+ if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+ return;
+
interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
pr_debug("create pit timer, interval is %llu nsec\n", interval);
@@ -393,15 +397,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
case 1:
/* FIXME: enhance mode 4 precision */
case 4:
- if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) {
- create_pit_timer(ps, val, 0);
- }
+ create_pit_timer(kvm, val, 0);
break;
case 2:
case 3:
- if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){
- create_pit_timer(ps, val, 1);
- }
+ create_pit_timer(kvm, val, 1);
break;
default:
destroy_pit_timer(kvm->arch.vpit);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cac4746d7ffb..b6a73537e1ef 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -262,9 +262,10 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq;
- struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
+ int irq, i;
+ struct kvm_vcpu *vcpu;
u8 irr = s->irr, isr = s->imr;
+ bool found = false;
s->last_irr = 0;
s->irr = 0;
@@ -281,12 +282,19 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
s->special_fully_nested_mode = 0;
s->init4 = 0;
- for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
- if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (irr & (1 << irq) || isr & (1 << irq)) {
- pic_clear_isr(s, irq);
- }
- }
+ kvm_for_each_vcpu(i, vcpu, s->pics_state->kvm)
+ if (kvm_apic_accept_pic_intr(vcpu)) {
+ found = true;
+ break;
+ }
+
+
+ if (!found)
+ return;
+
+ for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+ if (irr & (1 << irq) || isr & (1 << irq))
+ pic_clear_isr(s, irq);
}
static void pic_ioport_write(void *opaque, u32 addr, u32 val)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 54abb40199d6..cfdc6e0ef002 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -38,6 +38,7 @@
#include "irq.h"
#include "trace.h"
#include "x86.h"
+#include "cpuid.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -1120,7 +1121,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
return 0;
}
-static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
{
u32 reg = apic_get_reg(apic, lvt_type);
int vector, mode, trig_mode;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 138e8cc6fea6..6f4ce2575d09 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -34,6 +34,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f1b36cf3e3d0..224b02c3cda9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -59,15 +59,6 @@ enum {
AUDIT_POST_SYNC
};
-char *audit_point_name[] = {
- "pre page fault",
- "post page fault",
- "pre pte write",
- "post pte write",
- "pre sync",
- "post sync"
-};
-
#undef MMU_DEBUG
#ifdef MMU_DEBUG
@@ -83,13 +74,10 @@ char *audit_point_name[] = {
#endif
#ifdef MMU_DEBUG
-static int dbg = 0;
+static bool dbg = 0;
module_param(dbg, bool, 0644);
#endif
-static int oos_shadow = 1;
-module_param(oos_shadow, bool, 0644);
-
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -593,6 +581,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
+static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
+{
+ return cache->nobjs;
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
struct kmem_cache *cache)
{
@@ -953,21 +946,35 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
}
}
+static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
+{
+ struct kvm_lpage_info *linfo;
+
+ if (likely(level == PT_PAGE_TABLE_LEVEL))
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ linfo = lpage_info_slot(gfn, slot, level);
+ return &linfo->rmap_pde;
+}
+
/*
* Take gfn and return the reverse mapping to it.
*/
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
- struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
- if (likely(level == PT_PAGE_TABLE_LEVEL))
- return &slot->rmap[gfn - slot->base_gfn];
+ return __gfn_to_rmap(kvm, gfn, level, slot);
+}
- linfo = lpage_info_slot(gfn, slot, level);
+static bool rmap_can_add(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_memory_cache *cache;
- return &linfo->rmap_pde;
+ cache = &vcpu->arch.mmu_pte_list_desc_cache;
+ return mmu_memory_cache_free_objects(cache);
}
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -1004,17 +1011,16 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
+ struct kvm_memory_slot *slot)
{
unsigned long *rmapp;
u64 *spte;
int i, write_protected = 0;
- rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
-
+ rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
- BUG_ON(!spte);
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
@@ -1027,12 +1033,11 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
/* check for huge page mappings */
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- rmapp = gfn_to_rmap(kvm, gfn, i);
+ rmapp = __gfn_to_rmap(kvm, gfn, i, slot);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
- BUG_ON(!spte);
BUG_ON(!(*spte & PT_PRESENT_MASK));
- BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ BUG_ON(!is_large_pte(*spte));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
drop_spte(kvm, spte);
@@ -1047,6 +1052,14 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
return write_protected;
}
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ return kvm_mmu_rmap_write_protect(kvm, gfn, slot);
+}
+
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long data)
{
@@ -1103,15 +1116,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
int (*handler)(struct kvm *kvm, unsigned long *rmapp,
unsigned long data))
{
- int i, j;
+ int j;
int ret;
int retval = 0;
struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
- for (i = 0; i < slots->nmemslots; i++) {
- struct kvm_memory_slot *memslot = &slots->memslots[i];
+ kvm_for_each_memslot(memslot, slots) {
unsigned long start = memslot->userspace_addr;
unsigned long end;
@@ -1324,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+ bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
sp->parent_ptes = 0;
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1511,6 +1524,13 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
return ret;
}
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
+static void mmu_audit_disable(void) { }
+#endif
+
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -1640,6 +1660,18 @@ static void init_shadow_page_table(struct kvm_mmu_page *sp)
sp->spt[i] = 0ull;
}
+static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
+{
+ sp->write_flooding_count = 0;
+}
+
+static void clear_sp_write_flooding_count(u64 *spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ __clear_sp_write_flooding_count(sp);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1683,6 +1715,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
} else if (sp->unsync)
kvm_mmu_mark_parents_unsync(sp);
+ __clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
@@ -1796,7 +1829,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
-static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
u64 *spte)
{
u64 pte;
@@ -1804,17 +1837,21 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (is_last_spte(pte, sp->role.level))
+ if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
- else {
+ if (is_large_pte(pte))
+ --kvm->stat.lpages;
+ } else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- } else if (is_mmio_spte(pte))
+ return true;
+ }
+
+ if (is_mmio_spte(pte))
mmu_spte_clear_no_track(spte);
- if (is_large_pte(pte))
- --kvm->stat.lpages;
+ return false;
}
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
@@ -1831,15 +1868,6 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
mmu_page_remove_parent_pte(sp, parent_pte);
}
-static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
-{
- int i;
- struct kvm_vcpu *vcpu;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- vcpu->arch.last_pte_updated = NULL;
-}
-
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
@@ -1899,7 +1927,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
}
sp->role.invalid = 1;
- kvm_mmu_reset_last_pte_updated(kvm);
return ret;
}
@@ -1985,7 +2012,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
}
-static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
{
struct kvm_mmu_page *sp;
struct hlist_node *node;
@@ -1994,7 +2021,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
-
+ spin_lock(&kvm->mmu_lock);
for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
@@ -2002,22 +2029,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- return r;
-}
-
-static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
-{
- struct kvm_mmu_page *sp;
- struct hlist_node *node;
- LIST_HEAD(invalid_list);
+ spin_unlock(&kvm->mmu_lock);
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
- pgprintk("%s: zap %llx %x\n",
- __func__, gfn, sp->role.word);
- kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
+ return r;
}
+EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
{
@@ -2169,8 +2185,6 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 1;
if (!need_unsync && !s->unsync) {
- if (!oos_shadow)
- return 1;
need_unsync = true;
}
}
@@ -2191,11 +2205,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_mmio_spte(sptep, gfn, pfn, pte_access))
return 0;
- /*
- * We don't set the accessed bit, since we sometimes want to see
- * whether the guest actually used the pte (in order to detect
- * demand paging).
- */
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
@@ -2346,10 +2355,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
kvm_release_pfn_clean(pfn);
- if (speculative) {
- vcpu->arch.last_pte_updated = sptep;
- vcpu->arch.last_pte_gfn = gfn;
- }
}
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2840,12 +2845,12 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
return;
vcpu_clear_mmio_info(vcpu, ~0ul);
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2857,7 +2862,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
mmu_sync_children(vcpu, sp);
}
}
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+ kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3510,28 +3515,119 @@ static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
kvm_mmu_flush_tlb(vcpu);
}
-static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
+static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
+ const u8 *new, int *bytes)
{
- u64 *spte = vcpu->arch.last_pte_updated;
+ u64 gentry;
+ int r;
+
+ /*
+ * Assume that the pte write on a page table of the same type
+ * as the current vcpu paging mode since we update the sptes only
+ * when they have the same mode.
+ */
+ if (is_pae(vcpu) && *bytes == 4) {
+ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+ *gpa &= ~(gpa_t)7;
+ *bytes = 8;
+ r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+ if (r)
+ gentry = 0;
+ new = (const u8 *)&gentry;
+ }
- return !!(spte && (*spte & shadow_accessed_mask));
+ switch (*bytes) {
+ case 4:
+ gentry = *(const u32 *)new;
+ break;
+ case 8:
+ gentry = *(const u64 *)new;
+ break;
+ default:
+ gentry = 0;
+ break;
+ }
+
+ return gentry;
}
-static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+/*
+ * If we're seeing too many writes to a page, it may no longer be a page table,
+ * or we may be forking, in which case it is better to unmap the page.
+ */
+static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte)
{
- u64 *spte = vcpu->arch.last_pte_updated;
+ /*
+ * Skip write-flooding detected for the sp whose level is 1, because
+ * it can become unsync, then the guest page is not write-protected.
+ */
+ if (sp->role.level == 1)
+ return false;
- if (spte
- && vcpu->arch.last_pte_gfn == gfn
- && shadow_accessed_mask
- && !(*spte & shadow_accessed_mask)
- && is_shadow_present_pte(*spte))
- set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+ return ++sp->write_flooding_count >= 3;
+}
+
+/*
+ * Misaligned accesses are too much trouble to fix up; also, they usually
+ * indicate a page is not used as a page table.
+ */
+static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
+ int bytes)
+{
+ unsigned offset, pte_size, misaligned;
+
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, sp->role.word);
+
+ offset = offset_in_page(gpa);
+ pte_size = sp->role.cr4_pae ? 8 : 4;
+
+ /*
+ * Sometimes, the OS only writes the last one bytes to update status
+ * bits, for example, in linux, andb instruction is used in clear_bit().
+ */
+ if (!(offset & (pte_size - 1)) && bytes == 1)
+ return false;
+
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ misaligned |= bytes < 4;
+
+ return misaligned;
+}
+
+static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
+{
+ unsigned page_offset, quadrant;
+ u64 *spte;
+ int level;
+
+ page_offset = offset_in_page(gpa);
+ level = sp->role.level;
+ *nspte = 1;
+ if (!sp->role.cr4_pae) {
+ page_offset <<= 1; /* 32->64 */
+ /*
+ * A 32-bit pde maps 4MB while the shadow pdes map
+ * only 2MB. So we need to double the offset again
+ * and zap two pdes instead of one.
+ */
+ if (level == PT32_ROOT_LEVEL) {
+ page_offset &= ~7; /* kill rounding error */
+ page_offset <<= 1;
+ *nspte = 2;
+ }
+ quadrant = page_offset >> PAGE_SHIFT;
+ page_offset &= ~PAGE_MASK;
+ if (quadrant != sp->role.quadrant)
+ return NULL;
+ }
+
+ spte = &sp->spt[page_offset / sizeof(*spte)];
+ return spte;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes,
- bool guest_initiated)
+ const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
union kvm_mmu_page_role mask = { .word = 0 };
@@ -3539,8 +3635,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct hlist_node *node;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
- unsigned pte_size, page_offset, misaligned, quadrant, offset;
- int level, npte, invlpg_counter, r, flooded = 0;
+ int npte;
bool remote_flush, local_flush, zap_page;
/*
@@ -3551,112 +3646,45 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
return;
zap_page = remote_flush = local_flush = false;
- offset = offset_in_page(gpa);
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
- invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+ gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes);
/*
- * Assume that the pte write on a page table of the same type
- * as the current vcpu paging mode since we update the sptes only
- * when they have the same mode.
+ * No need to care whether allocation memory is successful
+ * or not since pte prefetch is skiped if it does not have
+ * enough objects in the cache.
*/
- if ((is_pae(vcpu) && bytes == 4) || !new) {
- /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
- if (is_pae(vcpu)) {
- gpa &= ~(gpa_t)7;
- bytes = 8;
- }
- r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
- if (r)
- gentry = 0;
- new = (const u8 *)&gentry;
- }
-
- switch (bytes) {
- case 4:
- gentry = *(const u32 *)new;
- break;
- case 8:
- gentry = *(const u64 *)new;
- break;
- default:
- gentry = 0;
- break;
- }
+ mmu_topup_memory_caches(vcpu);
spin_lock(&vcpu->kvm->mmu_lock);
- if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
- gentry = 0;
- kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
- if (guest_initiated) {
- kvm_mmu_access_page(vcpu, gfn);
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
- }
- }
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
- pte_size = sp->role.cr4_pae ? 8 : 4;
- misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- misaligned |= bytes < 4;
- if (misaligned || flooded) {
- /*
- * Misaligned accesses are too much trouble to fix
- * up; also, they usually indicate a page is not used
- * as a page table.
- *
- * If we're seeing too many writes to a page,
- * it may no longer be a page table, or we may be
- * forking, in which case it is better to unmap the
- * page.
- */
- pgprintk("misaligned: gpa %llx bytes %d role %x\n",
- gpa, bytes, sp->role.word);
+ spte = get_written_sptes(sp, gpa, &npte);
+
+ if (detect_write_misaligned(sp, gpa, bytes) ||
+ detect_write_flooding(sp, spte)) {
zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
&invalid_list);
++vcpu->kvm->stat.mmu_flooded;
continue;
}
- page_offset = offset;
- level = sp->role.level;
- npte = 1;
- if (!sp->role.cr4_pae) {
- page_offset <<= 1; /* 32->64 */
- /*
- * A 32-bit pde maps 4MB while the shadow pdes map
- * only 2MB. So we need to double the offset again
- * and zap two pdes instead of one.
- */
- if (level == PT32_ROOT_LEVEL) {
- page_offset &= ~7; /* kill rounding error */
- page_offset <<= 1;
- npte = 2;
- }
- quadrant = page_offset >> PAGE_SHIFT;
- page_offset &= ~PAGE_MASK;
- if (quadrant != sp->role.quadrant)
- continue;
- }
+
+ spte = get_written_sptes(sp, gpa, &npte);
+ if (!spte)
+ continue;
+
local_flush = true;
- spte = &sp->spt[page_offset / sizeof(*spte)];
while (npte--) {
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte);
if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
- & mask.word))
+ & mask.word) && rmap_can_add(vcpu))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
if (!remote_flush && need_remote_flush(entry, *spte))
remote_flush = true;
@@ -3665,7 +3693,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
}
@@ -3679,9 +3707,8 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
- spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
+
return r;
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
@@ -3702,10 +3729,18 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
+static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
+{
+ if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
void *insn, int insn_len)
{
- int r;
+ int r, emulation_type = EMULTYPE_RETRY;
enum emulation_result er;
r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
@@ -3717,11 +3752,10 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
goto out;
}
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- goto out;
+ if (is_mmio_page_fault(vcpu, cr2))
+ emulation_type = 0;
- er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
+ er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
switch (er) {
case EMULATE_DONE:
@@ -3792,7 +3826,11 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.mmu.translate_gpa = translate_gpa;
+ vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
return alloc_mmu_pages(vcpu);
}
@@ -3852,14 +3890,14 @@ restart:
spin_unlock(&kvm->mmu_lock);
}
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
- struct list_head *invalid_list)
+static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+ struct list_head *invalid_list)
{
struct kvm_mmu_page *page;
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+ kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
}
static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -3874,15 +3912,15 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
- int idx, freed_pages;
+ int idx;
LIST_HEAD(invalid_list);
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
if (!kvm_freed && nr_to_scan > 0 &&
kvm->arch.n_used_mmu_pages > 0) {
- freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
- &invalid_list);
+ kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+ &invalid_list);
kvm_freed = kvm;
}
nr_to_scan--;
@@ -3944,15 +3982,15 @@ nomem:
*/
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
{
- int i;
unsigned int nr_mmu_pages;
unsigned int nr_pages = 0;
struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
- for (i = 0; i < slots->nmemslots; i++)
- nr_pages += slots->memslots[i].npages;
+ kvm_for_each_memslot(memslot, slots)
+ nr_pages += memslot->npages;
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
@@ -3961,127 +3999,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}
-static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- if (len > buffer->len)
- return NULL;
- return buffer->ptr;
-}
-
-static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
- unsigned len)
-{
- void *ret;
-
- ret = pv_mmu_peek_buffer(buffer, len);
- if (!ret)
- return ret;
- buffer->ptr += len;
- buffer->len -= len;
- buffer->processed += len;
- return ret;
-}
-
-static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
- gpa_t addr, gpa_t value)
-{
- int bytes = 8;
- int r;
-
- if (!is_long_mode(vcpu) && !is_pae(vcpu))
- bytes = 4;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- if (!emulator_write_phys(vcpu, addr, &value, bytes))
- return -EFAULT;
-
- return 1;
-}
-
-static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
-{
- (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
- return 1;
-}
-
-static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
-{
- spin_lock(&vcpu->kvm->mmu_lock);
- mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
- spin_unlock(&vcpu->kvm->mmu_lock);
- return 1;
-}
-
-static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
- struct kvm_pv_mmu_op_buffer *buffer)
-{
- struct kvm_mmu_op_header *header;
-
- header = pv_mmu_peek_buffer(buffer, sizeof *header);
- if (!header)
- return 0;
- switch (header->op) {
- case KVM_MMU_OP_WRITE_PTE: {
- struct kvm_mmu_op_write_pte *wpte;
-
- wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
- if (!wpte)
- return 0;
- return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
- wpte->pte_val);
- }
- case KVM_MMU_OP_FLUSH_TLB: {
- struct kvm_mmu_op_flush_tlb *ftlb;
-
- ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
- if (!ftlb)
- return 0;
- return kvm_pv_mmu_flush_tlb(vcpu);
- }
- case KVM_MMU_OP_RELEASE_PT: {
- struct kvm_mmu_op_release_pt *rpt;
-
- rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
- if (!rpt)
- return 0;
- return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
- }
- default: return 0;
- }
-}
-
-int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
- gpa_t addr, unsigned long *ret)
-{
- int r;
- struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
-
- buffer->ptr = buffer->buf;
- buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
- buffer->processed = 0;
-
- r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
- if (r)
- goto out;
-
- while (buffer->len) {
- r = kvm_pv_mmu_op_one(vcpu, buffer);
- if (r < 0)
- goto out;
- if (r == 0)
- break;
- }
-
- r = 1;
-out:
- *ret = buffer->processed;
- return r;
-}
-
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
@@ -4110,12 +4027,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
mmu_free_memory_caches(vcpu);
}
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
void kvm_mmu_module_exit(void)
{
mmu_destroy_caches();
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 746ec259d024..fe15dcc07a6b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,6 +19,15 @@
#include <linux/ratelimit.h>
+char const *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write",
+ "pre sync",
+ "post sync"
+};
+
#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
fmt, audit_point_name[kvm->arch.audit_point], ##args)
@@ -224,7 +233,10 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, audit_spte);
}
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
+static bool mmu_audit;
+static struct jump_label_key mmu_audit_key;
+
+static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
@@ -236,18 +248,18 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
audit_vcpu_spte(vcpu);
}
-static bool mmu_audit;
+static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
+{
+ if (static_branch((&mmu_audit_key)))
+ __kvm_mmu_audit(vcpu, point);
+}
static void mmu_audit_enable(void)
{
- int ret;
-
if (mmu_audit)
return;
- ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- WARN_ON(ret);
-
+ jump_label_inc(&mmu_audit_key);
mmu_audit = true;
}
@@ -256,8 +268,7 @@ static void mmu_audit_disable(void)
if (!mmu_audit)
return;
- unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- tracepoint_synchronize_unregister();
+ jump_label_dec(&mmu_audit_key);
mmu_audit = false;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index eed67f34146d..89fb0e81322a 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,25 +243,6 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
__entry->access)
);
-
-TRACE_EVENT(
- kvm_mmu_audit,
- TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
- TP_ARGS(vcpu, audit_point),
-
- TP_STRUCT__entry(
- __field(struct kvm_vcpu *, vcpu)
- __field(int, audit_point)
- ),
-
- TP_fast_assign(
- __entry->vcpu = vcpu;
- __entry->audit_point = audit_point;
- ),
-
- TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
- audit_point_name[__entry->audit_point])
-);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 92994100638b..15610285ebb6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -497,6 +497,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_walk_next(&it)) {
gfn_t table_gfn;
+ clear_sp_write_flooding_count(it.sptep);
drop_large_spte(vcpu, it.sptep);
sp = NULL;
@@ -522,6 +523,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_walk_next(&it)) {
gfn_t direct_gfn;
+ clear_sp_write_flooding_count(it.sptep);
validate_direct_spte(vcpu, it.sptep, direct_access);
drop_large_spte(vcpu, it.sptep);
@@ -536,6 +538,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}
+ clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
@@ -599,11 +602,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- if (!prefault) {
+ if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- /* reset fork detector */
- vcpu->arch.last_pt_write_count = 0;
- }
+
return 0;
}
@@ -631,7 +632,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
- trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
+ kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
@@ -641,11 +642,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
sptep, *sptep, emulate);
- if (!emulate)
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
-
++vcpu->stat.pf_fixed;
- trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
+ kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
return emulate;
@@ -656,65 +654,66 @@ out_unlock:
return 0;
}
+static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
+{
+ int offset = 0;
+
+ WARN_ON(sp->role.level != 1);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+}
+
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- gpa_t pte_gpa = -1;
int level;
u64 *sptep;
- int need_flush = 0;
vcpu_clear_mmio_info(vcpu, gva);
- spin_lock(&vcpu->kvm->mmu_lock);
+ /*
+ * No need to check return value here, rmap_can_add() can
+ * help us to skip pte prefetch later.
+ */
+ mmu_topup_memory_caches(vcpu);
+ spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, gva, iterator) {
level = iterator.level;
sptep = iterator.sptep;
sp = page_header(__pa(sptep));
if (is_last_spte(*sptep, level)) {
- int offset, shift;
+ pt_element_t gpte;
+ gpa_t pte_gpa;
if (!sp->unsync)
break;
- shift = PAGE_SHIFT -
- (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
- offset = sp->role.quadrant << shift;
-
- pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+ pte_gpa = FNAME(get_level1_sp_gpa)(sp);
pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
- if (is_shadow_present_pte(*sptep)) {
- if (is_large_pte(*sptep))
- --vcpu->kvm->stat.lpages;
- drop_spte(vcpu->kvm, sptep);
- need_flush = 1;
- } else if (is_mmio_spte(*sptep))
- mmu_spte_clear_no_track(sptep);
+ if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
- break;
+ if (!rmap_can_add(vcpu))
+ break;
+
+ if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ break;
+
+ FNAME(update_pte)(vcpu, sp, sptep, &gpte);
}
if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
break;
}
-
- if (need_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
-
- atomic_inc(&vcpu->kvm->arch.invlpg_counter);
-
spin_unlock(&vcpu->kvm->mmu_lock);
-
- if (pte_gpa == -1)
- return;
-
- if (mmu_topup_memory_caches(vcpu))
- return;
- kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -769,19 +768,14 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- int i, offset, nr_present;
+ int i, nr_present = 0;
bool host_writable;
gpa_t first_pte_gpa;
- offset = nr_present = 0;
-
/* direct kvm_mmu_page can not be unsync. */
BUG_ON(sp->role.direct);
- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+ first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
unsigned pte_access;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
new file mode 100644
index 000000000000..7aad5446f393
--- /dev/null
+++ b/arch/x86/kvm/pmu.c
@@ -0,0 +1,533 @@
+/*
+ * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+
+static struct kvm_arch_event_perf_mapping {
+ u8 eventsel;
+ u8 unit_mask;
+ unsigned event_type;
+ bool inexact;
+} arch_events[] = {
+ /* Index must match CPUID 0x0A.EBX bit vector */
+ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
+ [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+ [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+ [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+};
+
+/* mapping between fixed pmc index and arch_events array */
+int fixed_pmc_events[] = {1, 0, 2};
+
+static bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline bool pmc_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[msr - base];
+ return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[msr - base];
+ return NULL;
+}
+
+static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
+{
+ return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
+}
+
+static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
+{
+ if (idx < X86_PMC_IDX_FIXED)
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
+ else
+ return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED);
+}
+
+void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.apic)
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
+
+static void trigger_pmi(struct irq_work *irq_work)
+{
+ struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
+ irq_work);
+ struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
+ arch.pmu);
+
+ kvm_deliver_pmi(vcpu);
+}
+
+static void kvm_perf_overflow(struct perf_event *perf_event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+ __set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
+}
+
+static void kvm_perf_overflow_intr(struct perf_event *perf_event,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct kvm_pmc *pmc = perf_event->overflow_handler_context;
+ struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
+ if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ kvm_perf_overflow(perf_event, data, regs);
+ kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+ /*
+ * Inject PMI. If vcpu was in a guest mode during NMI PMI
+ * can be ejected on a guest mode re-entry. Otherwise we can't
+ * be sure that vcpu wasn't executing hlt instruction at the
+ * time of vmexit and is not going to re-enter guest mode until,
+ * woken up. So we should wake it, but this is impossible from
+ * NMI context. Do it from irq work instead.
+ */
+ if (!kvm_is_in_guest())
+ irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+ else
+ kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
+ }
+}
+
+static u64 read_pmc(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+
+ /* FIXME: Scaling needed? */
+
+ return counter & pmc_bitmask(pmc);
+}
+
+static void stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = read_pmc(pmc);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
+ unsigned config, bool exclude_user, bool exclude_kernel,
+ bool intr)
+{
+ struct perf_event *event;
+ struct perf_event_attr attr = {
+ .type = type,
+ .size = sizeof(attr),
+ .pinned = true,
+ .exclude_idle = true,
+ .exclude_host = 1,
+ .exclude_user = exclude_user,
+ .exclude_kernel = exclude_kernel,
+ .config = config,
+ };
+
+ attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
+ event = perf_event_create_kernel_counter(&attr, -1, current,
+ intr ? kvm_perf_overflow_intr :
+ kvm_perf_overflow, pmc);
+ if (IS_ERR(event)) {
+ printk_once("kvm: pmu event creation failed %ld\n",
+ PTR_ERR(event));
+ return;
+ }
+
+ pmc->perf_event = event;
+ clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
+}
+
+static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(arch_events); i++)
+ if (arch_events[i].eventsel == event_select
+ && arch_events[i].unit_mask == unit_mask
+ && (pmu->available_event_types & (1 << i)))
+ break;
+
+ if (i == ARRAY_SIZE(arch_events))
+ return PERF_COUNT_HW_MAX;
+
+ return arch_events[i].event_type;
+}
+
+static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+{
+ unsigned config, type = PERF_TYPE_RAW;
+ u8 event_select, unit_mask;
+
+ pmc->eventsel = eventsel;
+
+ stop_counter(pmc);
+
+ if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+ return;
+
+ event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
+
+ if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE |
+ ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_CMASK))) {
+ config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
+ unit_mask);
+ if (config != PERF_COUNT_HW_MAX)
+ type = PERF_TYPE_HARDWARE;
+ }
+
+ if (type == PERF_TYPE_RAW)
+ config = eventsel & X86_RAW_EVENT_MASK;
+
+ reprogram_counter(pmc, type, config,
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT);
+}
+
+static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+{
+ unsigned en = en_pmi & 0x3;
+ bool pmi = en_pmi & 0x8;
+
+ stop_counter(pmc);
+
+ if (!en || !pmc_enabled(pmc))
+ return;
+
+ reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+ arch_events[fixed_pmc_events[idx]].event_type,
+ !(en & 0x2), /* exclude user */
+ !(en & 0x1), /* exclude kernel */
+ pmi);
+}
+
+static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+{
+ return (ctrl >> (idx * 4)) & 0xf;
+}
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 en_pmi = fixed_en_pmi(data, i);
+ struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
+
+ if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
+ continue;
+
+ reprogram_fixed_counter(pmc, en_pmi, i);
+ }
+
+ pmu->fixed_ctr_ctrl = data;
+}
+
+static void reprogram_idx(struct kvm_pmu *pmu, int idx)
+{
+ struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+
+ if (!pmc)
+ return;
+
+ if (pmc_is_gp(pmc))
+ reprogram_gp_counter(pmc, pmc->eventsel);
+ else {
+ int fidx = idx - X86_PMC_IDX_FIXED;
+ reprogram_fixed_counter(pmc,
+ fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+ }
+}
+
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 diff = pmu->global_ctrl ^ data;
+
+ pmu->global_ctrl = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_idx(pmu, bit);
+}
+
+bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ ret = pmu->version > 1;
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
+ || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
+ || get_fixed_pmc(pmu, msr);
+ break;
+ }
+ return ret;
+}
+
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc;
+
+ switch (index) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ *data = pmu->fixed_ctr_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ *data = pmu->global_status;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ *data = pmu->global_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *data = pmu->global_ovf_ctrl;
+ return 0;
+ default:
+ if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, index))) {
+ *data = read_pmc(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_pmc *pmc;
+
+ switch (index) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+ if (!(data & 0xfffffffffffff444)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ break; /* RO MSR */
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (pmu->global_ctrl == data)
+ return 0;
+ if (!(data & pmu->global_ctrl_mask)) {
+ global_ctrl_changed(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ pmu->global_status &= ~data;
+ pmu->global_ovf_ctrl = data;
+ return 0;
+ }
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, index))) {
+ data = (s64)(s32)data;
+ pmc->counter += data - read_pmc(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & 0xffffffff00200000ull)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
+int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ bool fast_mode = pmc & (1u << 31);
+ bool fixed = pmc & (1u << 30);
+ struct kvm_pmc *counters;
+ u64 ctr;
+
+ pmc &= (3u << 30) - 1;
+ if (!fixed && pmc >= pmu->nr_arch_gp_counters)
+ return 1;
+ if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+ return 1;
+ counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+ ctr = read_pmc(&counters[pmc]);
+ if (fast_mode)
+ ctr = (u32)ctr;
+ *data = ctr;
+
+ return 0;
+}
+
+void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ struct kvm_cpuid_entry2 *entry;
+ unsigned bitmap_len;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return;
+
+ pmu->version = entry->eax & 0xff;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff,
+ X86_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] =
+ ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1;
+ bitmap_len = (entry->eax >> 24) & 0xff;
+ pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1);
+
+ if (pmu->version == 1) {
+ pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1;
+ return;
+ }
+
+ pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f),
+ X86_PMC_MAX_FIXED);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1;
+ pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1)
+ | (((1ull << pmu->nr_arch_fixed_counters) - 1)
+ << X86_PMC_IDX_FIXED));
+}
+
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+
+ memset(pmu, 0, sizeof(*pmu));
+ for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+ for (i = 0; i < X86_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED;
+ }
+ init_irq_work(&pmu->irq_work, trigger_pmi);
+ kvm_pmu_cpuid_update(vcpu);
+}
+
+void kvm_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ int i;
+
+ irq_work_sync(&pmu->irq_work);
+ for (i = 0; i < X86_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+ stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+
+ for (i = 0; i < X86_PMC_MAX_FIXED; i++)
+ stop_counter(&pmu->fixed_counters[i]);
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+ pmu->global_ovf_ctrl = 0;
+}
+
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
+{
+ kvm_pmu_reset(vcpu);
+}
+
+void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ u64 bitmask;
+ int bit;
+
+ bitmask = pmu->reprogram_pmi;
+
+ for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
+
+ if (unlikely(!pmc || !pmc->perf_event)) {
+ clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ continue;
+ }
+
+ reprogram_idx(pmu, bit);
+ }
+}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e32243eac2f4..5fa553babe56 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1014,6 +1014,7 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_NMI);
set_intercept(svm, INTERCEPT_SMI);
set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_RDPMC);
set_intercept(svm, INTERCEPT_CPUID);
set_intercept(svm, INTERCEPT_INVD);
set_intercept(svm, INTERCEPT_HLT);
@@ -2770,6 +2771,19 @@ static int emulate_on_interception(struct vcpu_svm *svm)
return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
}
+static int rdpmc_interception(struct vcpu_svm *svm)
+{
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_NRIPS))
+ return emulate_on_interception(svm);
+
+ err = kvm_rdpmc(&svm->vcpu);
+ kvm_complete_insn_gp(&svm->vcpu, err);
+
+ return 1;
+}
+
bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
{
unsigned long cr0 = svm->vcpu.arch.cr0;
@@ -3190,6 +3204,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_SMI] = nop_on_interception,
[SVM_EXIT_INIT] = nop_on_interception,
[SVM_EXIT_VINTR] = interrupt_window_interception,
+ [SVM_EXIT_RDPMC] = rdpmc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
[SVM_EXIT_INVD] = emulate_on_interception,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index ae432ea1cd83..6b85cc647f34 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -18,9 +18,10 @@
#include <linux/atomic.h>
#include "kvm_timer.h"
-static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
{
- int restart_timer = 0;
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_vcpu *vcpu = ktimer->vcpu;
wait_queue_head_t *q = &vcpu->wq;
/*
@@ -40,26 +41,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
if (ktimer->t_ops->is_periodic(ktimer)) {
hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- restart_timer = 1;
- }
-
- return restart_timer;
-}
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- int restart_timer;
- struct kvm_vcpu *vcpu;
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
-
- vcpu = ktimer->vcpu;
- if (!vcpu)
- return HRTIMER_NORESTART;
-
- restart_timer = __kvm_timer_fn(vcpu, ktimer);
- if (restart_timer)
return HRTIMER_RESTART;
- else
+ } else
return HRTIMER_NORESTART;
}
-
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a0d6bd9ad442..d29216c462b3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -18,6 +18,7 @@
#include "irq.h"
#include "mmu.h"
+#include "cpuid.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
@@ -39,6 +40,7 @@
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
+#include <asm/perf_event.h>
#include "trace.h"
@@ -49,29 +51,29 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
-static int __read_mostly enable_vpid = 1;
+static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
-static int __read_mostly flexpriority_enabled = 1;
+static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
-static int __read_mostly enable_ept = 1;
+static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
-static int __read_mostly enable_unrestricted_guest = 1;
+static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
-static int __read_mostly emulate_invalid_guest_state = 0;
+static bool __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-static int __read_mostly vmm_exclusive = 1;
+static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
-static int __read_mostly yield_on_hlt = 1;
+static bool __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
-static int __read_mostly fasteoi = 1;
+static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
/*
@@ -79,7 +81,7 @@ module_param(fasteoi, bool, S_IRUGO);
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
* use VMX instructions.
*/
-static int __read_mostly nested = 0;
+static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
@@ -118,7 +120,7 @@ module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
-#define NR_AUTOLOAD_MSRS 1
+#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1
struct vmcs {
@@ -622,6 +624,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static bool cpu_has_load_ia32_efer;
+static bool cpu_has_load_perf_global_ctrl;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -1191,15 +1194,34 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(EXCEPTION_BITMAP, eb);
}
+static void clear_atomic_switch_msr_special(unsigned long entry,
+ unsigned long exit)
+{
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, entry);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, exit);
+}
+
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ clear_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ clear_atomic_switch_msr_special(
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+ return;
+ }
+ break;
}
for (i = 0; i < m->nr; ++i)
@@ -1215,25 +1237,55 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
+static void add_atomic_switch_msr_special(unsigned long entry,
+ unsigned long exit, unsigned long guest_val_vmcs,
+ unsigned long host_val_vmcs, u64 guest_val, u64 host_val)
+{
+ vmcs_write64(guest_val_vmcs, guest_val);
+ vmcs_write64(host_val_vmcs, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, entry);
+ vmcs_set_bits(VM_EXIT_CONTROLS, exit);
+}
+
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
- if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
- vmcs_write64(GUEST_IA32_EFER, guest_val);
- vmcs_write64(HOST_IA32_EFER, host_val);
- vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
- vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
- return;
+ switch (msr) {
+ case MSR_EFER:
+ if (cpu_has_load_ia32_efer) {
+ add_atomic_switch_msr_special(VM_ENTRY_LOAD_IA32_EFER,
+ VM_EXIT_LOAD_IA32_EFER,
+ GUEST_IA32_EFER,
+ HOST_IA32_EFER,
+ guest_val, host_val);
+ return;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (cpu_has_load_perf_global_ctrl) {
+ add_atomic_switch_msr_special(
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
+ GUEST_IA32_PERF_GLOBAL_CTRL,
+ HOST_IA32_PERF_GLOBAL_CTRL,
+ guest_val, host_val);
+ return;
+ }
+ break;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
- if (i == m->nr) {
+ if (i == NR_AUTOLOAD_MSRS) {
+ printk_once(KERN_WARNING"Not enough mst switch entries. "
+ "Can't add msr %x\n", msr);
+ return;
+ } else if (i == m->nr) {
++m->nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
@@ -1696,7 +1748,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
int save_nmsrs, index;
unsigned long *msr_bitmap;
- vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
@@ -1905,6 +1956,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
#endif
CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+ CPU_BASED_RDPMC_EXITING |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
/*
* We can allow some features even when not supported by the
@@ -2091,12 +2143,10 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 1;
/* Otherwise falls through */
default:
- vmx_load_host_state(to_vmx(vcpu));
if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
return 0;
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
- vmx_load_host_state(to_vmx(vcpu));
data = msr->data;
break;
}
@@ -2120,7 +2170,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
switch (msr_index) {
case MSR_EFER:
- vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
break;
#ifdef CONFIG_X86_64
@@ -2169,7 +2218,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
- vmx_load_host_state(vmx);
msr->data = data;
break;
}
@@ -2363,7 +2411,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING;
+ CPU_BASED_INVLPG_EXITING |
+ CPU_BASED_RDPMC_EXITING;
if (yield_on_hlt)
min |= CPU_BASED_HLT_EXITING;
@@ -2455,6 +2504,42 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
VM_EXIT_LOAD_IA32_EFER);
+ cpu_has_load_perf_global_ctrl =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
+
+ /*
+ * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
+ * but due to arrata below it can't be used. Workaround is to use
+ * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ *
+ * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
+ *
+ * AAK155 (model 26)
+ * AAP115 (model 30)
+ * AAT100 (model 37)
+ * BC86,AAY89,BD102 (model 44)
+ * BA97 (model 46)
+ *
+ */
+ if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case 26:
+ case 30:
+ case 37:
+ case 44:
+ case 46:
+ cpu_has_load_perf_global_ctrl = false;
+ printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
}
@@ -2629,11 +2714,13 @@ static gva_t rmode_tss_base(struct kvm *kvm)
{
if (!kvm->arch.tss_addr) {
struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
gfn_t base_gfn;
slots = kvm_memslots(kvm);
- base_gfn = slots->memslots[0].base_gfn +
- kvm->memslots->memslots[0].npages - 3;
+ slot = id_to_memslot(slots, 0);
+ base_gfn = slot->base_gfn + slot->npages - 3;
+
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
@@ -3858,12 +3945,15 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
u32 cpu_based_vm_exec_control;
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
- /* We can get here when nested_run_pending caused
- * vmx_interrupt_allowed() to return false. In this case, do
- * nothing - the interrupt will be injected later.
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+ /*
+ * We get here if vmx_interrupt_allowed() said we can't
+ * inject to L1 now because L2 must run. Ask L2 to exit
+ * right after entry, so we can inject to L1 more promptly.
*/
+ kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3990,11 +4080,12 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
- struct vmcs12 *vmcs12;
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ if (to_vmx(vcpu)->nested.nested_run_pending ||
+ (vmcs12->idt_vectoring_info_field &
+ VECTORING_INFO_VALID_MASK))
return 0;
nested_vmx_vmexit(vcpu);
- vmcs12 = get_vmcs12(vcpu);
vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
vmcs12->vm_exit_intr_info = 0;
/* fall through to normal code, but now in L1, not L2 */
@@ -4524,6 +4615,16 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_rdpmc(struct kvm_vcpu *vcpu)
+{
+ int err;
+
+ err = kvm_rdpmc(vcpu);
+ kvm_complete_insn_gp(vcpu, err);
+
+ return 1;
+}
+
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
skip_emulated_instruction(vcpu);
@@ -5474,6 +5575,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_HLT] = handle_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
+ [EXIT_REASON_RDPMC] = handle_rdpmc,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmclear,
[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
@@ -5968,6 +6070,24 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
}
+static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
+{
+ int i, nr_msrs;
+ struct perf_guest_switch_msr *msrs;
+
+ msrs = perf_guest_get_msrs(&nr_msrs);
+
+ if (!msrs)
+ return;
+
+ for (i = 0; i < nr_msrs; i++)
+ if (msrs[i].host == msrs[i].guest)
+ clear_atomic_switch_msr(vmx, msrs[i].msr);
+ else
+ add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
+ msrs[i].host);
+}
+
#ifdef CONFIG_X86_64
#define R "r"
#define Q "q"
@@ -6017,6 +6137,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ atomic_switch_perf_msrs(vmx);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c38efd7b792e..14d6cadc4ba6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "cpuid.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -82,15 +83,13 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries);
static void process_nmi(struct kvm_vcpu *vcpu);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
-int ignore_msrs = 0;
-module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool ignore_msrs = 0;
+module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
@@ -574,58 +573,6 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- return best && (best->ecx & bit(X86_FEATURE_XSAVE));
-}
-
-static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_SMEP));
-}
-
-static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
- return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
-}
-
-static void update_cpuid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
- struct kvm_lapic *apic = vcpu->arch.apic;
- u32 timer_mode_mask;
-
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
- if (!best)
- return;
-
- /* Update OSXSAVE bit */
- if (cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
- }
-
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
- best->function == 0x1) {
- best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
- timer_mode_mask = 3 << 17;
- } else
- timer_mode_mask = 1 << 17;
-
- if (apic)
- apic->lapic_timer.timer_mode_mask = timer_mode_mask;
-}
-
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -659,7 +606,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
+ kvm_update_cpuid(vcpu);
return 0;
}
@@ -813,6 +760,21 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
+bool kvm_rdpmc(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u64 data;
+ int err;
+
+ err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+ if (err)
+ return err;
+ kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ return err;
+}
+EXPORT_SYMBOL_GPL(kvm_rdpmc);
+
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -1362,12 +1324,11 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
if (page_num >= blob_size)
goto out;
r = -ENOMEM;
- page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- if (!page)
+ page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
+ if (IS_ERR(page)) {
+ r = PTR_ERR(page);
goto out;
- r = -EFAULT;
- if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
- goto out_free;
+ }
if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
@@ -1656,8 +1617,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
* which we perfectly emulate ;-). Any other value should be at least
* reported, some guests depend on them.
*/
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
case MSR_K7_EVNTSEL2:
@@ -1669,8 +1628,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
/* at least RHEL 4 unconditionally writes to the perfctr registers,
* so we ignore writes to make it happy.
*/
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
case MSR_K7_PERFCTR0:
case MSR_K7_PERFCTR1:
case MSR_K7_PERFCTR2:
@@ -1707,6 +1664,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_set_msr(vcpu, msr, data);
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
msr, data);
@@ -1869,10 +1828,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_SYSCFG:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
case MSR_K7_PERFCTR0:
case MSR_K8_INT_PENDING_MSG:
@@ -1983,6 +1938,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
data = 0xbe702111;
break;
default:
+ if (kvm_pmu_msr(vcpu, msr))
+ return kvm_pmu_get_msr(vcpu, msr, pdata);
if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -2041,15 +1998,12 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
if (msrs.nmsrs >= MAX_IO_MSRS)
goto out;
- r = -ENOMEM;
size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
- entries = kmalloc(size, GFP_KERNEL);
- if (!entries)
+ entries = memdup_user(user_msrs->entries, size);
+ if (IS_ERR(entries)) {
+ r = PTR_ERR(entries);
goto out;
-
- r = -EFAULT;
- if (copy_from_user(entries, user_msrs->entries, size))
- goto out_free;
+ }
r = n = __msr_io(vcpu, &msrs, entries, do_msr);
if (r < 0)
@@ -2135,6 +2089,9 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
+ case KVM_CAP_TSC_DEADLINE_TIMER:
+ r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
+ break;
default:
r = 0;
break;
@@ -2266,466 +2223,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
}
-static int is_efer_nx(void)
-{
- unsigned long long efer = 0;
-
- rdmsrl_safe(MSR_EFER, &efer);
- return efer & EFER_NX;
-}
-
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
-{
- int i;
- struct kvm_cpuid_entry2 *e, *entry;
-
- entry = NULL;
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- e = &vcpu->arch.cpuid_entries[i];
- if (e->function == 0x80000001) {
- entry = e;
- break;
- }
- }
- if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
- entry->edx &= ~(1 << 20);
- printk(KERN_INFO "kvm: guest NX capability removed\n");
- }
-}
-
-/* when an old userspace process fills a new kernel module */
-static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
- struct kvm_cpuid *cpuid,
- struct kvm_cpuid_entry __user *entries)
-{
- int r, i;
- struct kvm_cpuid_entry *cpuid_entries;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
- r = -EFAULT;
- if (copy_from_user(cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
- goto out_free;
- for (i = 0; i < cpuid->nent; i++) {
- vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
- vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
- vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
- vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
- vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
- vcpu->arch.cpuid_entries[i].index = 0;
- vcpu->arch.cpuid_entries[i].flags = 0;
- vcpu->arch.cpuid_entries[i].padding[0] = 0;
- vcpu->arch.cpuid_entries[i].padding[1] = 0;
- vcpu->arch.cpuid_entries[i].padding[2] = 0;
- }
- vcpu->arch.cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
- r = 0;
- kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
-
-out_free:
- vfree(cpuid_entries);
-out:
- return r;
-}
-
-static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- goto out;
- r = -EFAULT;
- if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- vcpu->arch.cpuid_nent = cpuid->nent;
- kvm_apic_set_version(vcpu);
- kvm_x86_ops->cpuid_update(vcpu);
- update_cpuid(vcpu);
- return 0;
-
-out:
- return r;
-}
-
-static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
- struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- int r;
-
- r = -E2BIG;
- if (cpuid->nent < vcpu->arch.cpuid_nent)
- goto out;
- r = -EFAULT;
- if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
- vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
- goto out;
- return 0;
-
-out:
- cpuid->nent = vcpu->arch.cpuid_nent;
- return r;
-}
-
-static void cpuid_mask(u32 *word, int wordnum)
-{
- *word &= boot_cpu_data.x86_capability[wordnum];
-}
-
-static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index)
-{
- entry->function = function;
- entry->index = index;
- cpuid_count(entry->function, entry->index,
- &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
- entry->flags = 0;
-}
-
-static bool supported_xcr0_bit(unsigned bit)
-{
- u64 mask = ((u64)1 << bit);
-
- return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
-}
-
-#define F(x) bit(X86_FEATURE_##x)
-
-static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
- u32 index, int *nent, int maxnent)
-{
- unsigned f_nx = is_efer_nx() ? F(NX) : 0;
-#ifdef CONFIG_X86_64
- unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
- ? F(GBPAGES) : 0;
- unsigned f_lm = F(LM);
-#else
- unsigned f_gbpages = 0;
- unsigned f_lm = 0;
-#endif
- unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
-
- /* cpuid 1.edx */
- const u32 kvm_supported_word0_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
- 0 /* Reserved, DS, ACPI */ | F(MMX) |
- F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
- 0 /* HTT, TM, Reserved, PBE */;
- /* cpuid 0x80000001.edx */
- const u32 kvm_supported_word1_x86_features =
- F(FPU) | F(VME) | F(DE) | F(PSE) |
- F(TSC) | F(MSR) | F(PAE) | F(MCE) |
- F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
- F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
- F(PAT) | F(PSE36) | 0 /* Reserved */ |
- f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
- F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
- 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
- /* cpuid 1.ecx */
- const u32 kvm_supported_word4_x86_features =
- F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
- 0 /* DS-CPL, VMX, SMX, EST */ |
- 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
- 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
- 0 /* Reserved, DCA */ | F(XMM4_1) |
- F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
- 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
- F(F16C) | F(RDRAND);
- /* cpuid 0x80000001.ecx */
- const u32 kvm_supported_word6_x86_features =
- F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
- F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
- F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
- 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
-
- /* cpuid 0xC0000001.edx */
- const u32 kvm_supported_word5_x86_features =
- F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
- F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
- F(PMM) | F(PMM_EN);
-
- /* cpuid 7.0.ebx */
- const u32 kvm_supported_word9_x86_features =
- F(SMEP) | F(FSGSBASE) | F(ERMS);
-
- /* all calls to cpuid_count() should be made on the same cpu */
- get_cpu();
- do_cpuid_1_ent(entry, function, index);
- ++*nent;
-
- switch (function) {
- case 0:
- entry->eax = min(entry->eax, (u32)0xd);
- break;
- case 1:
- entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
- entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
- /* we support x2apic emulation even if host does not support
- * it since we emulate x2apic in software */
- entry->ecx |= F(X2APIC);
- break;
- /* function 2 entries are STATEFUL. That is, repeated cpuid commands
- * may return different values. This forces us to get_cpu() before
- * issuing the first command, and also to emulate this annoying behavior
- * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
- case 2: {
- int t, times = entry->eax & 0xff;
-
- entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- for (t = 1; t < times && *nent < maxnent; ++t) {
- do_cpuid_1_ent(&entry[t], function, 0);
- entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
- ++*nent;
- }
- break;
- }
- /* function 4 has additional index. */
- case 4: {
- int i, cache_type;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until cache_type is zero */
- for (i = 1; *nent < maxnent; ++i) {
- cache_type = entry[i - 1].eax & 0x1f;
- if (!cache_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case 7: {
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
- if (index == 0) {
- entry->ebx &= kvm_supported_word9_x86_features;
- cpuid_mask(&entry->ebx, 9);
- } else
- entry->ebx = 0;
- entry->eax = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- }
- case 9:
- break;
- /* function 0xb has additional index. */
- case 0xb: {
- int i, level_type;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* read more entries until level_type is zero */
- for (i = 1; *nent < maxnent; ++i) {
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
- do_cpuid_1_ent(&entry[i], function, i);
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- }
- break;
- }
- case 0xd: {
- int idx, i;
-
- entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
- do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
- continue;
- entry[i].flags |=
- KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- ++*nent;
- ++i;
- }
- break;
- }
- case KVM_CPUID_SIGNATURE: {
- char signature[12] = "KVMKVMKVM\0\0";
- u32 *sigptr = (u32 *)signature;
- entry->eax = 0;
- entry->ebx = sigptr[0];
- entry->ecx = sigptr[1];
- entry->edx = sigptr[2];
- break;
- }
- case KVM_CPUID_FEATURES:
- entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
- (1 << KVM_FEATURE_NOP_IO_DELAY) |
- (1 << KVM_FEATURE_CLOCKSOURCE2) |
- (1 << KVM_FEATURE_ASYNC_PF) |
- (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
-
- if (sched_info_on())
- entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
-
- entry->ebx = 0;
- entry->ecx = 0;
- entry->edx = 0;
- break;
- case 0x80000000:
- entry->eax = min(entry->eax, 0x8000001a);
- break;
- case 0x80000001:
- entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
- entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
- break;
- case 0x80000008: {
- unsigned g_phys_as = (entry->eax >> 16) & 0xff;
- unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
- unsigned phys_as = entry->eax & 0xff;
-
- if (!g_phys_as)
- g_phys_as = phys_as;
- entry->eax = g_phys_as | (virt_as << 8);
- entry->ebx = entry->edx = 0;
- break;
- }
- case 0x80000019:
- entry->ecx = entry->edx = 0;
- break;
- case 0x8000001a:
- break;
- case 0x8000001d:
- break;
- /*Add support for Centaur's CPUID instruction*/
- case 0xC0000000:
- /*Just support up to 0xC0000004 now*/
- entry->eax = min(entry->eax, 0xC0000004);
- break;
- case 0xC0000001:
- entry->edx &= kvm_supported_word5_x86_features;
- cpuid_mask(&entry->edx, 5);
- break;
- case 3: /* Processor serial number */
- case 5: /* MONITOR/MWAIT */
- case 6: /* Thermal management */
- case 0xA: /* Architectural Performance Monitoring */
- case 0x80000007: /* Advanced power management */
- case 0xC0000002:
- case 0xC0000003:
- case 0xC0000004:
- default:
- entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
- break;
- }
-
- kvm_x86_ops->set_supported_cpuid(function, entry);
-
- put_cpu();
-}
-
-#undef F
-
-static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
- struct kvm_cpuid_entry2 __user *entries)
-{
- struct kvm_cpuid_entry2 *cpuid_entries;
- int limit, nent = 0, r = -E2BIG;
- u32 func;
-
- if (cpuid->nent < 1)
- goto out;
- if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
- cpuid->nent = KVM_MAX_CPUID_ENTRIES;
- r = -ENOMEM;
- cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
- if (!cpuid_entries)
- goto out;
-
- do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
- limit = cpuid_entries[0].eax;
- for (func = 1; func <= limit && nent < cpuid->nent; ++func)
- do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
- limit = cpuid_entries[nent - 1].eax;
- for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
- do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
-
-
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- /* Add support for Centaur's CPUID instruction. */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
- do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
- &nent, cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- limit = cpuid_entries[nent - 1].eax;
- for (func = 0xC0000001;
- func <= limit && nent < cpuid->nent; ++func)
- do_cpuid_ent(&cpuid_entries[nent], func, 0,
- &nent, cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
- }
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
- cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
- cpuid->nent);
-
- r = -E2BIG;
- if (nent >= cpuid->nent)
- goto out_free;
-
- r = -EFAULT;
- if (copy_to_user(entries, cpuid_entries,
- nent * sizeof(struct kvm_cpuid_entry2)))
- goto out_free;
- cpuid->nent = nent;
- r = 0;
-
-out_free:
- vfree(cpuid_entries);
-out:
- return r;
-}
-
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
@@ -3043,13 +2540,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!vcpu->arch.apic)
goto out;
- u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.lapic)
- goto out;
- r = -EFAULT;
- if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
+ u.lapic = memdup_user(argp, sizeof(*u.lapic));
+ if (IS_ERR(u.lapic)) {
+ r = PTR_ERR(u.lapic);
goto out;
+ }
+
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
if (r)
goto out;
@@ -3228,14 +2724,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xsave)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
- break;
+ u.xsave = memdup_user(argp, sizeof(*u.xsave));
+ if (IS_ERR(u.xsave)) {
+ r = PTR_ERR(u.xsave);
+ goto out;
+ }
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
@@ -3256,15 +2749,11 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_SET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
- r = -ENOMEM;
- if (!u.xcrs)
- break;
-
- r = -EFAULT;
- if (copy_from_user(u.xcrs, argp,
- sizeof(struct kvm_xcrs)))
- break;
+ u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
+ if (IS_ERR(u.xcrs)) {
+ r = PTR_ERR(u.xcrs);
+ goto out;
+ }
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
@@ -3461,16 +2950,59 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
return 0;
}
+/**
+ * write_protect_slot - write protect a slot for dirty logging
+ * @kvm: the kvm instance
+ * @memslot: the slot we protect
+ * @dirty_bitmap: the bitmap indicating which pages are dirty
+ * @nr_dirty_pages: the number of dirty pages
+ *
+ * We have two ways to find all sptes to protect:
+ * 1. Use kvm_mmu_slot_remove_write_access() which walks all shadow pages and
+ * checks ones that have a spte mapping a page in the slot.
+ * 2. Use kvm_mmu_rmap_write_protect() for each gfn found in the bitmap.
+ *
+ * Generally speaking, if there are not so many dirty pages compared to the
+ * number of shadow pages, we should use the latter.
+ *
+ * Note that letting others write into a page marked dirty in the old bitmap
+ * by using the remaining tlb entry is not a problem. That page will become
+ * write protected again when we flush the tlb and then be reported dirty to
+ * the user space by copying the old bitmap.
+ */
+static void write_protect_slot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ unsigned long *dirty_bitmap,
+ unsigned long nr_dirty_pages)
+{
+ /* Not many dirty pages compared to # of shadow pages. */
+ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) {
+ unsigned long gfn_offset;
+
+ for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) {
+ unsigned long gfn = memslot->base_gfn + gfn_offset;
+
+ spin_lock(&kvm->mmu_lock);
+ kvm_mmu_rmap_write_protect(kvm, gfn, memslot);
+ spin_unlock(&kvm->mmu_lock);
+ }
+ kvm_flush_remote_tlbs(kvm);
+ } else {
+ spin_lock(&kvm->mmu_lock);
+ kvm_mmu_slot_remove_write_access(kvm, memslot->id);
+ spin_unlock(&kvm->mmu_lock);
+ }
+}
+
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_dirty_log *log)
{
- int r, i;
+ int r;
struct kvm_memory_slot *memslot;
- unsigned long n;
- unsigned long is_dirty = 0;
+ unsigned long n, nr_dirty_pages;
mutex_lock(&kvm->slots_lock);
@@ -3478,43 +3010,41 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
if (log->slot >= KVM_MEMORY_SLOTS)
goto out;
- memslot = &kvm->memslots->memslots[log->slot];
+ memslot = id_to_memslot(kvm->memslots, log->slot);
r = -ENOENT;
if (!memslot->dirty_bitmap)
goto out;
n = kvm_dirty_bitmap_bytes(memslot);
-
- for (i = 0; !is_dirty && i < n/sizeof(long); i++)
- is_dirty = memslot->dirty_bitmap[i];
+ nr_dirty_pages = memslot->nr_dirty_pages;
/* If nothing is dirty, don't bother messing with page tables. */
- if (is_dirty) {
+ if (nr_dirty_pages) {
struct kvm_memslots *slots, *old_slots;
- unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap, *dirty_bitmap_head;
- dirty_bitmap = memslot->dirty_bitmap_head;
- if (memslot->dirty_bitmap == dirty_bitmap)
- dirty_bitmap += n / sizeof(long);
- memset(dirty_bitmap, 0, n);
+ dirty_bitmap = memslot->dirty_bitmap;
+ dirty_bitmap_head = memslot->dirty_bitmap_head;
+ if (dirty_bitmap == dirty_bitmap_head)
+ dirty_bitmap_head += n / sizeof(long);
+ memset(dirty_bitmap_head, 0, n);
r = -ENOMEM;
- slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+ slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
if (!slots)
goto out;
- memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
- slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
- slots->generation++;
+
+ memslot = id_to_memslot(slots, log->slot);
+ memslot->nr_dirty_pages = 0;
+ memslot->dirty_bitmap = dirty_bitmap_head;
+ update_memslots(slots, NULL);
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
synchronize_srcu_expedited(&kvm->srcu);
- dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
kfree(old_slots);
- spin_lock(&kvm->mmu_lock);
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- spin_unlock(&kvm->mmu_lock);
+ write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
r = -EFAULT;
if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
@@ -3659,14 +3189,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto get_irqchip_out;
+ }
+
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
goto get_irqchip_out;
@@ -3685,14 +3215,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
case KVM_SET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
- struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
+ struct kvm_irqchip *chip;
- r = -ENOMEM;
- if (!chip)
+ chip = memdup_user(argp, sizeof(*chip));
+ if (IS_ERR(chip)) {
+ r = PTR_ERR(chip);
goto out;
- r = -EFAULT;
- if (copy_from_user(chip, argp, sizeof *chip))
- goto set_irqchip_out;
+ }
+
r = -ENXIO;
if (!irqchip_in_kernel(kvm))
goto set_irqchip_out;
@@ -3899,12 +3429,7 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
kvm_x86_ops->get_segment(vcpu, var, seg);
}
-static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
-{
- return gpa;
-}
-
-static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
+gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
{
gpa_t t_gpa;
struct x86_exception exception;
@@ -4088,7 +3613,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes);
return 1;
}
@@ -4325,7 +3850,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
- kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+ kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -4350,32 +3875,24 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
return r;
}
-
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port, void *val,
- unsigned int count)
+static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val,
+ unsigned int count, bool in)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-
- if (vcpu->arch.pio.count)
- goto data_avail;
-
- trace_kvm_pio(0, port, size, count);
+ trace_kvm_pio(!in, port, size, count);
vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 1;
+ vcpu->arch.pio.in = in;
vcpu->arch.pio.count = count;
vcpu->arch.pio.size = size;
if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
- data_avail:
- memcpy(val, vcpu->arch.pio_data, size * count);
vcpu->arch.pio.count = 0;
return 1;
}
vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_IN;
+ vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
vcpu->run->io.size = size;
vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
vcpu->run->io.count = count;
@@ -4384,36 +3901,37 @@ static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
return 0;
}
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port,
- const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int ret;
- trace_kvm_pio(1, port, size, count);
-
- vcpu->arch.pio.port = port;
- vcpu->arch.pio.in = 0;
- vcpu->arch.pio.count = count;
- vcpu->arch.pio.size = size;
-
- memcpy(vcpu->arch.pio_data, val, size * count);
+ if (vcpu->arch.pio.count)
+ goto data_avail;
- if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+ ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
+ if (ret) {
+data_avail:
+ memcpy(val, vcpu->arch.pio_data, size * count);
vcpu->arch.pio.count = 0;
return 1;
}
- vcpu->run->exit_reason = KVM_EXIT_IO;
- vcpu->run->io.direction = KVM_EXIT_IO_OUT;
- vcpu->run->io.size = size;
- vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
- vcpu->run->io.count = count;
- vcpu->run->io.port = port;
-
return 0;
}
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ memcpy(vcpu->arch.pio_data, val, size * count);
+ return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
+}
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -4628,6 +4146,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
}
+static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
+ u32 pmc, u64 *pdata)
+{
+ return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+}
+
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
{
emul_to_vcpu(ctxt)->arch.halt_request = 1;
@@ -4680,6 +4204,7 @@ static struct x86_emulate_ops emulate_ops = {
.set_dr = emulator_set_dr,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
+ .read_pmc = emulator_read_pmc,
.halt = emulator_halt,
.wbinvd = emulator_wbinvd,
.fix_hypercall = emulator_fix_hypercall,
@@ -4837,6 +4362,50 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
return false;
}
+static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
+ unsigned long cr2, int emulation_type)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+
+ last_retry_eip = vcpu->arch.last_retry_eip;
+ last_retry_addr = vcpu->arch.last_retry_addr;
+
+ /*
+ * If the emulation is caused by #PF and it is non-page_table
+ * writing instruction, it means the VM-EXIT is caused by shadow
+ * page protected, we can zap the shadow page and retry this
+ * instruction directly.
+ *
+ * Note: if the guest uses a non-page-table modifying instruction
+ * on the PDE that points to the instruction, then we will unmap
+ * the instruction and go to an infinite loop. So, we cache the
+ * last retried eip and the last fault address, if we meet the eip
+ * and the address again, we can break out of the potential infinite
+ * loop.
+ */
+ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
+
+ if (!(emulation_type & EMULTYPE_RETRY))
+ return false;
+
+ if (x86_page_table_writing_insn(ctxt))
+ return false;
+
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ return false;
+
+ vcpu->arch.last_retry_eip = ctxt->eip;
+ vcpu->arch.last_retry_addr = cr2;
+
+ if (!vcpu->arch.mmu.direct_map)
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+
+ return true;
+}
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu,
unsigned long cr2,
int emulation_type,
@@ -4878,6 +4447,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return EMULATE_DONE;
}
+ if (retry_instruction(ctxt, cr2, emulation_type))
+ return EMULATE_DONE;
+
/* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
@@ -5096,17 +4668,17 @@ static void kvm_timer_init(void)
static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
-static int kvm_is_in_guest(void)
+int kvm_is_in_guest(void)
{
- return percpu_read(current_vcpu) != NULL;
+ return __this_cpu_read(current_vcpu) != NULL;
}
static int kvm_is_user_mode(void)
{
int user_mode = 3;
- if (percpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu));
+ if (__this_cpu_read(current_vcpu))
+ user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -5115,8 +4687,8 @@ static unsigned long kvm_get_guest_ip(void)
{
unsigned long ip = 0;
- if (percpu_read(current_vcpu))
- ip = kvm_rip_read(percpu_read(current_vcpu));
+ if (__this_cpu_read(current_vcpu))
+ ip = kvm_rip_read(__this_cpu_read(current_vcpu));
return ip;
}
@@ -5129,13 +4701,13 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = {
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
{
- percpu_write(current_vcpu, vcpu);
+ __this_cpu_write(current_vcpu, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
{
- percpu_write(current_vcpu, NULL);
+ __this_cpu_write(current_vcpu, NULL);
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
@@ -5234,15 +4806,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
- unsigned long a1)
-{
- if (is_long_mode(vcpu))
- return a0;
- else
- return a0 | ((gpa_t)a1 << 32);
-}
-
int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
{
u64 param, ingpa, outgpa, ret;
@@ -5338,9 +4901,6 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
- case KVM_HC_MMU_OP:
- r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
- break;
default:
ret = -KVM_ENOSYS;
break;
@@ -5370,125 +4930,6 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
}
-static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
-{
- struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
- int j, nent = vcpu->arch.cpuid_nent;
-
- e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
- /* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; ; j = (j + 1) % nent) {
- struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
- if (ej->function == e->function) {
- ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
- return j;
- }
- }
- return 0; /* silence gcc, even though control never reaches here */
-}
-
-/* find an entry with matching function, matching index (if needed), and that
- * should be read next (if it's stateful) */
-static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
- u32 function, u32 index)
-{
- if (e->function != function)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
- return 0;
- if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
- !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
- return 0;
- return 1;
-}
-
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- int i;
- struct kvm_cpuid_entry2 *best = NULL;
-
- for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
- struct kvm_cpuid_entry2 *e;
-
- e = &vcpu->arch.cpuid_entries[i];
- if (is_matching_cpuid_entry(e, function, index)) {
- if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
- move_to_next_stateful_cpuid_entry(vcpu, i);
- best = e;
- break;
- }
- }
- return best;
-}
-EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
-
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid_entry2 *best;
-
- best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
- if (!best || best->eax < 0x80000008)
- goto not_found;
- best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
- if (best)
- return best->eax & 0xff;
-not_found:
- return 36;
-}
-
-/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
- */
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
-{
- struct kvm_cpuid_entry2 *maxlevel;
-
- maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
- if (!maxlevel || maxlevel->eax >= function)
- return NULL;
- if (function & 0x80000000) {
- maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
- if (!maxlevel)
- return NULL;
- }
- return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
-}
-
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
-{
- u32 function, index;
- struct kvm_cpuid_entry2 *best;
-
- function = kvm_register_read(vcpu, VCPU_REGS_RAX);
- index = kvm_register_read(vcpu, VCPU_REGS_RCX);
- kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
- kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
- best = kvm_find_cpuid_entry(vcpu, function, index);
-
- if (!best)
- best = check_cpuid_limit(vcpu, function, index);
-
- if (best) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
- }
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- trace_kvm_cpuid(function,
- kvm_register_read(vcpu, VCPU_REGS_RAX),
- kvm_register_read(vcpu, VCPU_REGS_RBX),
- kvm_register_read(vcpu, VCPU_REGS_RCX),
- kvm_register_read(vcpu, VCPU_REGS_RDX));
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
-
/*
* Check if userspace requested an interrupt window, and that the
* interrupt window is open.
@@ -5649,6 +5090,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
int r;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
+ bool req_immediate_exit = 0;
if (vcpu->requests) {
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5688,7 +5130,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
record_steal_time(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
-
+ req_immediate_exit =
+ kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
+ if (kvm_check_request(KVM_REQ_PMU, vcpu))
+ kvm_handle_pmu_event(vcpu);
+ if (kvm_check_request(KVM_REQ_PMI, vcpu))
+ kvm_deliver_pmi(vcpu);
}
r = kvm_mmu_reload(vcpu);
@@ -5739,6 +5186,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ if (req_immediate_exit)
+ smp_send_reschedule(vcpu->cpu);
+
kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
@@ -5944,10 +5394,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (r <= 0)
goto out;
- if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
- kvm_register_write(vcpu, VCPU_REGS_RAX,
- kvm_run->hypercall.ret);
-
r = __vcpu_run(vcpu);
out:
@@ -6149,7 +5595,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (sregs->cr4 & X86_CR4_OSXSAVE)
- update_cpuid(vcpu);
+ kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
@@ -6426,6 +5872,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ kvm_pmu_reset(vcpu);
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -6474,10 +5922,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm = vcpu->kvm;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
- vcpu->arch.mmu.root_hpa = INVALID_PAGE;
- vcpu->arch.mmu.translate_gpa = translate_gpa;
- vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
@@ -6514,6 +5958,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_mce_banks;
kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
return 0;
fail_free_mce_banks:
@@ -6532,6 +5977,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d36fe237c665..cb80c293cdd8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -33,9 +33,6 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
return (nr == BP_VECTOR) || (nr == OF_VECTOR);
}
-struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
- u32 function, u32 index);
-
static inline bool is_protmode(struct kvm_vcpu *vcpu)
{
return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
@@ -125,4 +122,6 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+extern u64 host_xcr0;
+
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index cf4603ba866f..642d8805bc1b 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -856,18 +856,23 @@ static void __init lguest_init_IRQ(void)
}
/*
- * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
- * rather than set them in lguest_init_IRQ we are called here every time an
- * lguest device needs an interrupt.
- *
- * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
- * pass that up!
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code. So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
*/
-void lguest_setup_irq(unsigned int irq)
+int lguest_setup_irq(unsigned int irq)
{
- irq_alloc_desc_at(irq, 0);
+ int err;
+
+ /* Returns -ve error or vector number. */
+ err = irq_alloc_desc_at(irq, 0);
+ if (err < 0 && err != -EEXIST)
+ return err;
+
irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
+ return 0;
}
/*
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 46fc4ee09fc4..88ad5fbda6e1 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
const insn_attr_t *table;
if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
return 0;
- table = inat_avx_tables[vex_m][vex_p];
+ /* At first, this checks the master table */
+ table = inat_avx_tables[vex_m][0];
if (!table)
return 0;
+ if (!inat_is_group(table[opcode]) && vex_p) {
+ /* If this is not a group, get attribute directly */
+ table = inat_avx_tables[vex_m][vex_p];
+ if (!table)
+ return 0;
+ }
return table[opcode];
}
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 374562ed6704..5a1f9f3e3fbb 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn)
m = insn_vex_m_bits(insn);
p = insn_vex_p_bits(insn);
insn->attr = inat_get_avx_attribute(op, m, p);
- if (!inat_accept_vex(insn->attr))
+ if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))
insn->attr = 0; /* This instruction is bad */
goto end; /* VEX has only 1 byte for opcode */
}
@@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn)
pfx = insn_last_prefix(insn);
insn->attr = inat_get_group_attribute(mod, pfx,
insn->attr);
+ if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
+ insn->attr = 0; /* This is bad */
}
}
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 82004d2bf05e..bd59090825db 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr);
size_t strlen(const char *s)
{
int d0;
- int res;
+ size_t res;
asm volatile("repne\n\t"
- "scasb\n\t"
- "notl %0\n\t"
- "decl %0"
+ "scasb"
: "=c" (res), "=&D" (d0)
: "1" (s), "a" (0), "0" (0xffffffffu)
: "memory");
- return res;
+ return ~res - 1;
}
EXPORT_SYMBOL(strlen);
#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index a793da5e560e..5b83c51c12e0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -1,5 +1,11 @@
# x86 Opcode Maps
#
+# This is (mostly) based on following documentations.
+# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2
+# (#325383-040US, October 2011)
+# - Intel(R) Advanced Vector Extensions Programming Reference
+# (#319433-011,JUNE 2011).
+#
#<Opcode maps>
# Table: table-name
# Referrer: escaped-name
@@ -15,10 +21,13 @@
# EndTable
#
# AVX Superscripts
-# (VEX): this opcode can accept VEX prefix.
-# (oVEX): this opcode requires VEX prefix.
-# (o128): this opcode only supports 128bit VEX.
-# (o256): this opcode only supports 256bit VEX.
+# (v): this opcode requires VEX prefix.
+# (v1): this opcode only supports 128bit VEX.
+#
+# Last Prefix Superscripts
+# - (66): the last prefix is 0x66
+# - (F3): the last prefix is 0xF3
+# - (F2): the last prefix is 0xF2
#
Table: one byte opcode
@@ -199,8 +208,8 @@ a0: MOV AL,Ob
a1: MOV rAX,Ov
a2: MOV Ob,AL
a3: MOV Ov,rAX
-a4: MOVS/B Xb,Yb
-a5: MOVS/W/D/Q Xv,Yv
+a4: MOVS/B Yb,Xb
+a5: MOVS/W/D/Q Yv,Xv
a6: CMPS/B Xb,Yb
a7: CMPS/W/D Xv,Yv
a8: TEST AL,Ib
@@ -233,8 +242,8 @@ c0: Grp2 Eb,Ib (1A)
c1: Grp2 Ev,Ib (1A)
c2: RETN Iw (f64)
c3: RETN
-c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
-c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
+c4: LES Gz,Mp (i64) | VEX+2byte (Prefix)
+c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix)
c6: Grp11 Eb,Ib (1A)
c7: Grp11 Ev,Iz (1A)
c8: ENTER Iw,Ib
@@ -320,14 +329,19 @@ AVXcode: 1
# 3DNow! uses the last imm byte as opcode extension.
0f: 3DNow! Pq,Qq,Ib
# 0x0f 0x10-0x1f
-10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
-11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
-12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
-13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
-14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
-15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
-16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
-17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
+# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands
+# but it actually has operands. And also, vmovss and vmovsd only accept 128bit.
+# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form.
+# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming
+# Reference A.1
+10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1)
+11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1)
+12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2)
+13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1)
+14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66)
+15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66)
+16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3)
+17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1)
18: Grp16 (1A)
19:
1a:
@@ -345,14 +359,14 @@ AVXcode: 1
25:
26:
27:
-28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
-29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
-2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
-2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
-2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
-2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
-2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
+28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66)
+29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66)
+2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1)
+2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66)
+2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1)
+2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1)
+2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1)
+2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1)
# 0x0f 0x30-0x3f
30: WRMSR
31: RDTSC
@@ -388,65 +402,66 @@ AVXcode: 1
4e: CMOVLE/NG Gv,Ev
4f: CMOVNLE/G Gv,Ev
# 0x0f 0x50-0x5f
-50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
-51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
-52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
-53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
-54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
-55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
-56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
-57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
-58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
-59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
-5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
-5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
-5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
-5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
-5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
-5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
+50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66)
+51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1)
+52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1)
+53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1)
+54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66)
+55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66)
+56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66)
+57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66)
+58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1)
+59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1)
+5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1)
+5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3)
+5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1)
+5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1)
+5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1)
+5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1)
# 0x0f 0x60-0x6f
-60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
-61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
-62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
-63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
-64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
-65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
-66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
-67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
-68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
-69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
-6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
-6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
-6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
-6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
-6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
-6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
+60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1)
+61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1)
+62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1)
+63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1)
+64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1)
+65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1)
+66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1)
+67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1)
+68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1)
+69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1)
+6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1)
+6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1)
+6c: vpunpcklqdq Vx,Hx,Wx (66),(v1)
+6d: vpunpckhqdq Vx,Hx,Wx (66),(v1)
+6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1)
+6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3)
# 0x0f 0x70-0x7f
-70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128)
+70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1)
71: Grp12 (1A)
72: Grp13 (1A)
73: Grp14 (1A)
-74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
-75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
-76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
-77: emms/vzeroupper/vzeroall (VEX)
-78: VMREAD Ed/q,Gd/q
-79: VMWRITE Gd/q,Ed/q
+74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1)
+75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1)
+76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1)
+# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX.
+77: emms | vzeroupper | vzeroall
+78: VMREAD Ey,Gy
+79: VMWRITE Gy,Ey
7a:
7b:
-7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
-7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
-7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
-7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
+7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2)
+7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2)
+7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
+7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
# 0x0f 0x80-0x8f
80: JO Jz (f64)
81: JNO Jz (f64)
-82: JB/JNAE/JC Jz (f64)
-83: JNB/JAE/JNC Jz (f64)
-84: JZ/JE Jz (f64)
-85: JNZ/JNE Jz (f64)
+82: JB/JC/JNAE Jz (f64)
+83: JAE/JNB/JNC Jz (f64)
+84: JE/JZ Jz (f64)
+85: JNE/JNZ Jz (f64)
86: JBE/JNA Jz (f64)
-87: JNBE/JA Jz (f64)
+87: JA/JNBE Jz (f64)
88: JS Jz (f64)
89: JNS Jz (f64)
8a: JP/JPE Jz (f64)
@@ -502,18 +517,18 @@ b8: JMPE | POPCNT Gv,Ev (F3)
b9: Grp10 (1A)
ba: Grp8 Ev,Ib (1A)
bb: BTC Ev,Gv
-bc: BSF Gv,Ev
-bd: BSR Gv,Ev
+bc: BSF Gv,Ev | TZCNT Gv,Ev (F3)
+bd: BSR Gv,Ev | LZCNT Gv,Ev (F3)
be: MOVSX Gv,Eb
bf: MOVSX Gv,Ew
# 0x0f 0xc0-0xcf
c0: XADD Eb,Gb
c1: XADD Ev,Gv
-c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
-c3: movnti Md/q,Gd/q
-c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
-c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
-c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
+c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1)
+c3: movnti My,Gy
+c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1)
+c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1)
+c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66)
c7: Grp9 (1A)
c8: BSWAP RAX/EAX/R8/R8D
c9: BSWAP RCX/ECX/R9/R9D
@@ -524,55 +539,55 @@ cd: BSWAP RBP/EBP/R13/R13D
ce: BSWAP RSI/ESI/R14/R14D
cf: BSWAP RDI/EDI/R15/R15D
# 0x0f 0xd0-0xdf
-d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
-d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
-d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
-d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
-d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
-d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
-d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
-d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
-d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
-d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
-da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
-db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
-dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
-dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
-de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
-df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
+d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2)
+d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1)
+d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1)
+d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1)
+d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1)
+d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1)
+d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
+d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1)
+d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1)
+d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1)
+da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1)
+db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1)
+dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1)
+dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1)
+de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1)
+df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1)
# 0x0f 0xe0-0xef
-e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
-e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
-e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
-e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
-e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
-e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
-e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
-e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
-e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
-e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
-ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
-eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
-ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
-ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
-ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
-ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
+e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1)
+e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1)
+e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1)
+e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1)
+e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1)
+e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1)
+e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2)
+e7: movntq Mq,Pq | vmovntdq Mx,Vx (66)
+e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1)
+e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1)
+ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1)
+eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1)
+ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1)
+ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1)
+ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1)
+ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1)
# 0x0f 0xf0-0xff
-f0: lddqu Vdq,Mdq (F2),(VEX)
-f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
-f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
-f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
-f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
-f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
-f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
-f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
-f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
-f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
-fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
-fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
-fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
-fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
-fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
+f0: vlddqu Vx,Mx (F2)
+f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1)
+f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1)
+f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1)
+f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1)
+f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1)
+f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1)
+f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1)
+f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1)
+f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1)
+fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1)
+fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1)
+fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1)
+fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1)
+fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1)
ff:
EndTable
@@ -580,155 +595,193 @@ Table: 3-byte opcode 1 (0x0f 0x38)
Referrer: 3-byte escape 1
AVXcode: 2
# 0x0f 0x38 0x00-0x0f
-00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
-01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
-02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
-03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
-04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
-05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
-06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
-07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
-08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
-09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
-0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
-0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
-0c: Vpermilps /r (66),(oVEX)
-0d: Vpermilpd /r (66),(oVEX)
-0e: vtestps /r (66),(oVEX)
-0f: vtestpd /r (66),(oVEX)
+00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1)
+01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1)
+02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1)
+03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1)
+04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1)
+05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1)
+06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1)
+07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1)
+08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1)
+09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1)
+0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1)
+0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1)
+0c: vpermilps Vx,Hx,Wx (66),(v)
+0d: vpermilpd Vx,Hx,Wx (66),(v)
+0e: vtestps Vx,Wx (66),(v)
+0f: vtestpd Vx,Wx (66),(v)
# 0x0f 0x38 0x10-0x1f
10: pblendvb Vdq,Wdq (66)
11:
12:
-13:
+13: vcvtph2ps Vx,Wx,Ib (66),(v)
14: blendvps Vdq,Wdq (66)
15: blendvpd Vdq,Wdq (66)
-16:
-17: ptest Vdq,Wdq (66),(VEX)
-18: vbroadcastss /r (66),(oVEX)
-19: vbroadcastsd /r (66),(oVEX),(o256)
-1a: vbroadcastf128 /r (66),(oVEX),(o256)
+16: vpermps Vqq,Hqq,Wqq (66),(v)
+17: vptest Vx,Wx (66)
+18: vbroadcastss Vx,Wd (66),(v)
+19: vbroadcastsd Vqq,Wq (66),(v)
+1a: vbroadcastf128 Vqq,Mdq (66),(v)
1b:
-1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
-1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
-1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
+1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1)
+1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1)
+1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1)
1f:
# 0x0f 0x38 0x20-0x2f
-20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
-21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
-22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
-23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
-24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
-25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
+20: vpmovsxbw Vx,Ux/Mq (66),(v1)
+21: vpmovsxbd Vx,Ux/Md (66),(v1)
+22: vpmovsxbq Vx,Ux/Mw (66),(v1)
+23: vpmovsxwd Vx,Ux/Mq (66),(v1)
+24: vpmovsxwq Vx,Ux/Md (66),(v1)
+25: vpmovsxdq Vx,Ux/Mq (66),(v1)
26:
27:
-28: pmuldq Vdq,Wdq (66),(VEX),(o128)
-29: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
-2a: movntdqa Vdq,Mdq (66),(VEX),(o128)
-2b: packusdw Vdq,Wdq (66),(VEX),(o128)
-2c: vmaskmovps(ld) /r (66),(oVEX)
-2d: vmaskmovpd(ld) /r (66),(oVEX)
-2e: vmaskmovps(st) /r (66),(oVEX)
-2f: vmaskmovpd(st) /r (66),(oVEX)
+28: vpmuldq Vx,Hx,Wx (66),(v1)
+29: vpcmpeqq Vx,Hx,Wx (66),(v1)
+2a: vmovntdqa Vx,Mx (66),(v1)
+2b: vpackusdw Vx,Hx,Wx (66),(v1)
+2c: vmaskmovps Vx,Hx,Mx (66),(v)
+2d: vmaskmovpd Vx,Hx,Mx (66),(v)
+2e: vmaskmovps Mx,Hx,Vx (66),(v)
+2f: vmaskmovpd Mx,Hx,Vx (66),(v)
# 0x0f 0x38 0x30-0x3f
-30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
-31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
-32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
-33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
-34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
-35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
-36:
-37: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
-38: pminsb Vdq,Wdq (66),(VEX),(o128)
-39: pminsd Vdq,Wdq (66),(VEX),(o128)
-3a: pminuw Vdq,Wdq (66),(VEX),(o128)
-3b: pminud Vdq,Wdq (66),(VEX),(o128)
-3c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
-3d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
-3e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
-3f: pmaxud Vdq,Wdq (66),(VEX),(o128)
+30: vpmovzxbw Vx,Ux/Mq (66),(v1)
+31: vpmovzxbd Vx,Ux/Md (66),(v1)
+32: vpmovzxbq Vx,Ux/Mw (66),(v1)
+33: vpmovzxwd Vx,Ux/Mq (66),(v1)
+34: vpmovzxwq Vx,Ux/Md (66),(v1)
+35: vpmovzxdq Vx,Ux/Mq (66),(v1)
+36: vpermd Vqq,Hqq,Wqq (66),(v)
+37: vpcmpgtq Vx,Hx,Wx (66),(v1)
+38: vpminsb Vx,Hx,Wx (66),(v1)
+39: vpminsd Vx,Hx,Wx (66),(v1)
+3a: vpminuw Vx,Hx,Wx (66),(v1)
+3b: vpminud Vx,Hx,Wx (66),(v1)
+3c: vpmaxsb Vx,Hx,Wx (66),(v1)
+3d: vpmaxsd Vx,Hx,Wx (66),(v1)
+3e: vpmaxuw Vx,Hx,Wx (66),(v1)
+3f: vpmaxud Vx,Hx,Wx (66),(v1)
# 0x0f 0x38 0x40-0x8f
-40: pmulld Vdq,Wdq (66),(VEX),(o128)
-41: phminposuw Vdq,Wdq (66),(VEX),(o128)
-80: INVEPT Gd/q,Mdq (66)
-81: INVPID Gd/q,Mdq (66)
+40: vpmulld Vx,Hx,Wx (66),(v1)
+41: vphminposuw Vdq,Wdq (66),(v1)
+42:
+43:
+44:
+45: vpsrlvd/q Vx,Hx,Wx (66),(v)
+46: vpsravd Vx,Hx,Wx (66),(v)
+47: vpsllvd/q Vx,Hx,Wx (66),(v)
+# Skip 0x48-0x57
+58: vpbroadcastd Vx,Wx (66),(v)
+59: vpbroadcastq Vx,Wx (66),(v)
+5a: vbroadcasti128 Vqq,Mdq (66),(v)
+# Skip 0x5b-0x77
+78: vpbroadcastb Vx,Wx (66),(v)
+79: vpbroadcastw Vx,Wx (66),(v)
+# Skip 0x7a-0x7f
+80: INVEPT Gy,Mdq (66)
+81: INVPID Gy,Mdq (66)
+82: INVPCID Gy,Mdq (66)
+8c: vpmaskmovd/q Vx,Hx,Mx (66),(v)
+8e: vpmaskmovd/q Mx,Vx,Hx (66),(v)
# 0x0f 0x38 0x90-0xbf (FMA)
-96: vfmaddsub132pd/ps /r (66),(VEX)
-97: vfmsubadd132pd/ps /r (66),(VEX)
-98: vfmadd132pd/ps /r (66),(VEX)
-99: vfmadd132sd/ss /r (66),(VEX),(o128)
-9a: vfmsub132pd/ps /r (66),(VEX)
-9b: vfmsub132sd/ss /r (66),(VEX),(o128)
-9c: vfnmadd132pd/ps /r (66),(VEX)
-9d: vfnmadd132sd/ss /r (66),(VEX),(o128)
-9e: vfnmsub132pd/ps /r (66),(VEX)
-9f: vfnmsub132sd/ss /r (66),(VEX),(o128)
-a6: vfmaddsub213pd/ps /r (66),(VEX)
-a7: vfmsubadd213pd/ps /r (66),(VEX)
-a8: vfmadd213pd/ps /r (66),(VEX)
-a9: vfmadd213sd/ss /r (66),(VEX),(o128)
-aa: vfmsub213pd/ps /r (66),(VEX)
-ab: vfmsub213sd/ss /r (66),(VEX),(o128)
-ac: vfnmadd213pd/ps /r (66),(VEX)
-ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
-ae: vfnmsub213pd/ps /r (66),(VEX)
-af: vfnmsub213sd/ss /r (66),(VEX),(o128)
-b6: vfmaddsub231pd/ps /r (66),(VEX)
-b7: vfmsubadd231pd/ps /r (66),(VEX)
-b8: vfmadd231pd/ps /r (66),(VEX)
-b9: vfmadd231sd/ss /r (66),(VEX),(o128)
-ba: vfmsub231pd/ps /r (66),(VEX)
-bb: vfmsub231sd/ss /r (66),(VEX),(o128)
-bc: vfnmadd231pd/ps /r (66),(VEX)
-bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
-be: vfnmsub231pd/ps /r (66),(VEX)
-bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
+90: vgatherdd/q Vx,Hx,Wx (66),(v)
+91: vgatherqd/q Vx,Hx,Wx (66),(v)
+92: vgatherdps/d Vx,Hx,Wx (66),(v)
+93: vgatherqps/d Vx,Hx,Wx (66),(v)
+94:
+95:
+96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v)
+97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v)
+98: vfmadd132ps/d Vx,Hx,Wx (66),(v)
+99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9a: vfmsub132ps/d Vx,Hx,Wx (66),(v)
+9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v)
+9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1)
+9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v)
+9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1)
+a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v)
+a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v)
+a8: vfmadd213ps/d Vx,Hx,Wx (66),(v)
+a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+aa: vfmsub213ps/d Vx,Hx,Wx (66),(v)
+ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v)
+ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1)
+ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v)
+af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1)
+b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v)
+b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v)
+b8: vfmadd231ps/d Vx,Hx,Wx (66),(v)
+b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+ba: vfmsub231ps/d Vx,Hx,Wx (66),(v)
+bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
+bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v)
+bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1)
+be: vfnmsub231ps/d Vx,Hx,Wx (66),(v)
+bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1)
# 0x0f 0x38 0xc0-0xff
-db: aesimc Vdq,Wdq (66),(VEX),(o128)
-dc: aesenc Vdq,Wdq (66),(VEX),(o128)
-dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
-de: aesdec Vdq,Wdq (66),(VEX),(o128)
-df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
-f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
-f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
+db: VAESIMC Vdq,Wdq (66),(v1)
+dc: VAESENC Vdq,Hdq,Wdq (66),(v1)
+dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1)
+de: VAESDEC Vdq,Hdq,Wdq (66),(v1)
+df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1)
+f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2)
+f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2)
+f3: ANDN Gy,By,Ey (v)
+f4: Grp17 (1A)
+f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v)
+f6: MULX By,Gy,rDX,Ey (F2),(v)
+f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v)
EndTable
Table: 3-byte opcode 2 (0x0f 0x3a)
Referrer: 3-byte escape 2
AVXcode: 3
# 0x0f 0x3a 0x00-0xff
-04: vpermilps /r,Ib (66),(oVEX)
-05: vpermilpd /r,Ib (66),(oVEX)
-06: vperm2f128 /r,Ib (66),(oVEX),(o256)
-08: roundps Vdq,Wdq,Ib (66),(VEX)
-09: roundpd Vdq,Wdq,Ib (66),(VEX)
-0a: roundss Vss,Wss,Ib (66),(VEX),(o128)
-0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
-0c: blendps Vdq,Wdq,Ib (66),(VEX)
-0d: blendpd Vdq,Wdq,Ib (66),(VEX)
-0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
-0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
-14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
-15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
-16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
-17: extractps Ed,Vdq,Ib (66),(VEX),(o128)
-18: vinsertf128 /r,Ib (66),(oVEX),(o256)
-19: vextractf128 /r,Ib (66),(oVEX),(o256)
-20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
-21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
-22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
-40: dpps Vdq,Wdq,Ib (66),(VEX)
-41: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
-42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
-44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128)
-4a: vblendvps /r,Ib (66),(oVEX)
-4b: vblendvpd /r,Ib (66),(oVEX)
-4c: vpblendvb /r,Ib (66),(oVEX),(o128)
-60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
-61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
-62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
-63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
-df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
+00: vpermq Vqq,Wqq,Ib (66),(v)
+01: vpermpd Vqq,Wqq,Ib (66),(v)
+02: vpblendd Vx,Hx,Wx,Ib (66),(v)
+03:
+04: vpermilps Vx,Wx,Ib (66),(v)
+05: vpermilpd Vx,Wx,Ib (66),(v)
+06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v)
+07:
+08: vroundps Vx,Wx,Ib (66)
+09: vroundpd Vx,Wx,Ib (66)
+0a: vroundss Vss,Wss,Ib (66),(v1)
+0b: vroundsd Vsd,Wsd,Ib (66),(v1)
+0c: vblendps Vx,Hx,Wx,Ib (66)
+0d: vblendpd Vx,Hx,Wx,Ib (66)
+0e: vpblendw Vx,Hx,Wx,Ib (66),(v1)
+0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1)
+14: vpextrb Rd/Mb,Vdq,Ib (66),(v1)
+15: vpextrw Rd/Mw,Vdq,Ib (66),(v1)
+16: vpextrd/q Ey,Vdq,Ib (66),(v1)
+17: vextractps Ed,Vdq,Ib (66),(v1)
+18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v)
+19: vextractf128 Wdq,Vqq,Ib (66),(v)
+1d: vcvtps2ph Wx,Vx,Ib (66),(v)
+20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1)
+21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1)
+22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1)
+38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v)
+39: vextracti128 Wdq,Vqq,Ib (66),(v)
+40: vdpps Vx,Hx,Wx,Ib (66)
+41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1)
+42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1)
+44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1)
+46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v)
+4a: vblendvps Vx,Hx,Wx,Lx (66),(v)
+4b: vblendvpd Vx,Hx,Wx,Lx (66),(v)
+4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1)
+60: vpcmpestrm Vdq,Wdq,Ib (66),(v1)
+61: vpcmpestri Vdq,Wdq,Ib (66),(v1)
+62: vpcmpistrm Vdq,Wdq,Ib (66),(v1)
+63: vpcmpistri Vdq,Wdq,Ib (66),(v1)
+df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1)
+f0: RORX Gy,Ey,Ib (F2),(v)
EndTable
GrpTable: Grp1
@@ -790,7 +843,7 @@ GrpTable: Grp5
2: CALLN Ev (f64)
3: CALLF Ep
4: JMPN Ev (f64)
-5: JMPF Ep
+5: JMPF Mp
6: PUSH Ev (d64)
7:
EndTable
@@ -807,7 +860,7 @@ EndTable
GrpTable: Grp7
0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
-2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
+2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B)
3: LIDT Ms
4: SMSW Mw/Rv
5:
@@ -824,44 +877,45 @@ EndTable
GrpTable: Grp9
1: CMPXCHG8B/16B Mq/Mdq
-6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
-7: VMPTRST Mq
+6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B)
+7: VMPTRST Mq | VMPTRST Mq (F3)
EndTable
GrpTable: Grp10
EndTable
GrpTable: Grp11
+# Note: the operands are given by group opcode
0: MOV
EndTable
GrpTable: Grp12
-2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
-4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
-6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1)
+4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1)
+6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp13
-2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
-4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
-6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
+2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1)
+4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1)
+6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp14
-2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
-3: psrldq Udq,Ib (66),(11B),(VEX),(o128)
-6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
-7: pslldq Udq,Ib (66),(11B),(VEX),(o128)
+2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1)
+3: vpsrldq Hx,Ux,Ib (66),(11B),(v1)
+6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1)
+7: vpslldq Hx,Ux,Ib (66),(11B),(v1)
EndTable
GrpTable: Grp15
-0: fxsave
-1: fxstor
-2: ldmxcsr (VEX)
-3: stmxcsr (VEX)
+0: fxsave | RDFSBASE Ry (F3),(11B)
+1: fxstor | RDGSBASE Ry (F3),(11B)
+2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B)
+3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B)
4: XSAVE
5: XRSTOR | lfence (11B)
-6: mfence (11B)
+6: XSAVEOPT | mfence (11B)
7: clflush | sfence (11B)
EndTable
@@ -872,6 +926,12 @@ GrpTable: Grp16
3: prefetch T2
EndTable
+GrpTable: Grp17
+1: BLSR By,Ey (v)
+2: BLSMSK By,Ey (v)
+3: BLSI By,Ey (v)
+EndTable
+
# AMD's Prefetch Group
GrpTable: GrpP
0: PREFETCH
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3d11327c9ab4..23d8e5fecf76 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -27,6 +27,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
-
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index d0474ad2a6e5..1fb85dbe390a 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs)
if (fixup) {
/* If fixup is less than 16, it means uaccess error */
if (fixup->fixup < 16) {
- current_thread_info()->uaccess_err = -EFAULT;
+ current_thread_info()->uaccess_err = 1;
regs->ip += fixup->fixup;
return 1;
}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5db0490deb07..9d74824a708d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code,
static noinline void
no_context(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, int signal, int si_code)
{
struct task_struct *tsk = current;
unsigned long *stackend;
@@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code,
int sig;
/* Are we prepared to handle this kernel fault? */
- if (fixup_exception(regs))
+ if (fixup_exception(regs)) {
+ if (current_thread_info()->sig_on_uaccess_error && signal) {
+ tsk->thread.trap_no = 14;
+ tsk->thread.error_code = error_code | PF_USER;
+ tsk->thread.cr2 = address;
+
+ /* XXX: hwpoison faults will set the wrong code. */
+ force_sig_info_fault(signal, si_code, address, tsk, 0);
+ }
return;
+ }
/*
* 32-bit:
@@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (is_f00f_bug(regs, address))
return;
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGSEGV, si_code);
}
static noinline void
@@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
return;
}
@@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (!(fault & VM_FAULT_RETRY))
up_read(&current->mm->mmap_sem);
if (!(error_code & PF_USER))
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address, 0, 0);
return 1;
}
if (!(fault & VM_FAULT_ERROR))
@@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & PF_USER)) {
up_read(&current->mm->mmap_sem);
- no_context(regs, error_code, address);
+ no_context(regs, error_code, address,
+ SIGSEGV, SEGV_MAPERR);
return 1;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ea305856151c..dd74e46828c0 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -201,6 +201,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index b49962662101..f4f29b19fac5 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -45,6 +45,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
+ arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
@@ -88,6 +89,7 @@ void __kunmap_atomic(void *kvaddr)
*/
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
+ arch_flush_lazy_mmu_mode();
}
#ifdef CONFIG_DEBUG_HIGHMEM
else {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 87488b93a65c..6cabf6570d64 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -3,6 +3,7 @@
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
+#include <linux/bootmem.h> /* for max_low_pfn */
#include <asm/cacheflush.h>
#include <asm/e820.h>
@@ -15,6 +16,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
+#include <asm/dma.h> /* for MAX_DMA_PFN */
unsigned long __initdata pgt_buf_start;
unsigned long __meminitdata pgt_buf_end;
@@ -67,7 +69,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
good_end = max_pfn_mapped << PAGE_SHIFT;
base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
- if (base == MEMBLOCK_ERROR)
+ if (!base)
panic("Cannot find space for the kernel page tables");
pgt_buf_start = base >> PAGE_SHIFT;
@@ -80,7 +82,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
void __init native_pagetable_reserve(u64 start, u64 end)
{
- memblock_x86_reserve_range(start, end, "PGTABLE");
+ memblock_reserve(start, end - start);
}
struct map_range {
@@ -279,8 +281,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
* pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
* so that they can be reused for other purposes.
*
- * On native it just means calling memblock_x86_reserve_range, on Xen it
- * also means marking RW the pagetable pages that we allocated before
+ * On native it just means calling memblock_reserve, on Xen it also
+ * means marking RW the pagetable pages that we allocated before
* but that haven't been used.
*
* In fact on xen we mark RO the whole range pgt_buf_start -
@@ -392,3 +394,24 @@ void free_initrd_mem(unsigned long start, unsigned long end)
free_init_pages("initrd memory", start, PAGE_ALIGN(end));
}
#endif
+
+void __init zone_sizes_init(void)
+{
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
+
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
+#endif
+#ifdef CONFIG_ZONE_DMA32
+ max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
+#endif
+ max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
+#ifdef CONFIG_HIGHMEM
+ max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
+#endif
+
+ free_area_init_nodes(max_zone_pfns);
+}
+
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 29f7c6d98179..8663f6c47ccb 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -427,23 +427,17 @@ static void __init add_one_highpage_init(struct page *page)
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
- struct range *range;
- int nr_range;
- int i;
-
- nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
-
- for (i = 0; i < nr_range; i++) {
- struct page *page;
- int node_pfn;
-
- for (node_pfn = range[i].start; node_pfn < range[i].end;
- node_pfn++) {
- if (!pfn_valid(node_pfn))
- continue;
- page = pfn_to_page(node_pfn);
- add_one_highpage_init(page);
- }
+ phys_addr_t start, end;
+ u64 i;
+
+ for_each_free_mem_range(i, nid, &start, &end, NULL) {
+ unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
+ start_pfn, end_pfn);
+ unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
+ start_pfn, end_pfn);
+ for ( ; pfn < e_pfn; pfn++)
+ if (pfn_valid(pfn))
+ add_one_highpage_init(pfn_to_page(pfn));
}
}
#else
@@ -650,18 +644,18 @@ void __init initmem_init(void)
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
- memblock_x86_register_active_regions(0, 0, highend_pfn);
- sparse_memory_present_with_active_regions(0);
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
- memblock_x86_register_active_regions(0, 0, max_low_pfn);
- sparse_memory_present_with_active_regions(0);
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
+
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
+ sparse_memory_present_with_active_regions(0);
+
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
@@ -674,22 +668,6 @@ void __init initmem_init(void)
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
-static void __init zone_sizes_init(void)
-{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] =
- virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-#endif
- max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
-#ifdef CONFIG_HIGHMEM
- max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
-#endif
-
- free_area_init_nodes(max_zone_pfns);
-}
-
void __init setup_bootmem_allocator(void)
{
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
@@ -760,6 +738,17 @@ void __init mem_init(void)
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
+ /*
+ * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
+ * be done before free_all_bootmem(). Memblock use free low memory for
+ * temporary data (see find_range_array()) and for this purpose can use
+ * pages that was already passed to the buddy allocator, hence marked as
+ * not accessible in the page tables when compiled with
+ * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
+ * important here.
+ */
+ set_highmem_pages_init();
+
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
@@ -771,8 +760,6 @@ void __init mem_init(void)
if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
reservedpages++;
- set_highmem_pages_init();
-
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bbaaa005bf0e..436a0309db33 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -608,21 +608,12 @@ kernel_physical_mapping_init(unsigned long start,
#ifndef CONFIG_NUMA
void __init initmem_init(void)
{
- memblock_x86_register_active_regions(0, 0, max_pfn);
+ memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
}
#endif
void __init paging_init(void)
{
- unsigned long max_zone_pfns[MAX_NR_ZONES];
-
- memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-#ifdef CONFIG_ZONE_DMA
- max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-#endif
- max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
- max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
@@ -634,7 +625,7 @@ void __init paging_init(void)
*/
node_clear_state(0, N_NORMAL_MEMORY);
- free_area_init_nodes(max_zone_pfns);
+ zone_sizes_init();
}
/*
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
deleted file mode 100644
index 992da5ec5a64..000000000000
--- a/arch/x86/mm/memblock.c
+++ /dev/null
@@ -1,348 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/bitops.h>
-#include <linux/memblock.h>
-#include <linux/bootmem.h>
-#include <linux/mm.h>
-#include <linux/range.h>
-
-/* Check for already reserved areas */
-bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
-{
- struct memblock_region *r;
- u64 addr = *addrp, last;
- u64 size = *sizep;
- bool changed = false;
-
-again:
- last = addr + size;
- for_each_memblock(reserved, r) {
- if (last > r->base && addr < r->base) {
- size = r->base - addr;
- changed = true;
- goto again;
- }
- if (last > (r->base + r->size) && addr < (r->base + r->size)) {
- addr = round_up(r->base + r->size, align);
- size = last - addr;
- changed = true;
- goto again;
- }
- if (last <= (r->base + r->size) && addr >= r->base) {
- *sizep = 0;
- return false;
- }
- }
- if (changed) {
- *addrp = addr;
- *sizep = size;
- }
- return changed;
-}
-
-/*
- * Find next free range after start, and size is returned in *sizep
- */
-u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
-{
- struct memblock_region *r;
-
- for_each_memblock(memory, r) {
- u64 ei_start = r->base;
- u64 ei_last = ei_start + r->size;
- u64 addr;
-
- addr = round_up(ei_start, align);
- if (addr < start)
- addr = round_up(start, align);
- if (addr >= ei_last)
- continue;
- *sizep = ei_last - addr;
- while (memblock_x86_check_reserved_size(&addr, sizep, align))
- ;
-
- if (*sizep)
- return addr;
- }
-
- return MEMBLOCK_ERROR;
-}
-
-static __init struct range *find_range_array(int count)
-{
- u64 end, size, mem;
- struct range *range;
-
- size = sizeof(struct range) * count;
- end = memblock.current_limit;
-
- mem = memblock_find_in_range(0, end, size, sizeof(struct range));
- if (mem == MEMBLOCK_ERROR)
- panic("can not find more space for range array");
-
- /*
- * This range is tempoaray, so don't reserve it, it will not be
- * overlapped because We will not alloccate new buffer before
- * We discard this one
- */
- range = __va(mem);
- memset(range, 0, size);
-
- return range;
-}
-
-static void __init memblock_x86_subtract_reserved(struct range *range, int az)
-{
- u64 final_start, final_end;
- struct memblock_region *r;
-
- /* Take out region array itself at first*/
- memblock_free_reserved_regions();
-
- memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
-
- for_each_memblock(reserved, r) {
- memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
- final_start = PFN_DOWN(r->base);
- final_end = PFN_UP(r->base + r->size);
- if (final_start >= final_end)
- continue;
- subtract_range(range, az, final_start, final_end);
- }
-
- /* Put region array back ? */
- memblock_reserve_reserved_regions();
-}
-
-struct count_data {
- int nr;
-};
-
-static int __init count_work_fn(unsigned long start_pfn,
- unsigned long end_pfn, void *datax)
-{
- struct count_data *data = datax;
-
- data->nr++;
-
- return 0;
-}
-
-static int __init count_early_node_map(int nodeid)
-{
- struct count_data data;
-
- data.nr = 0;
- work_with_active_regions(nodeid, count_work_fn, &data);
-
- return data.nr;
-}
-
-int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
- unsigned long start_pfn, unsigned long end_pfn)
-{
- int count;
- struct range *range;
- int nr_range;
-
- count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
-
- range = find_range_array(count);
- nr_range = 0;
-
- /*
- * Use early_node_map[] and memblock.reserved.region to get range array
- * at first
- */
- nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
- subtract_range(range, count, 0, start_pfn);
- subtract_range(range, count, end_pfn, -1ULL);
-
- memblock_x86_subtract_reserved(range, count);
- nr_range = clean_sort_range(range, count);
-
- *rangep = range;
- return nr_range;
-}
-
-int __init get_free_all_memory_range(struct range **rangep, int nodeid)
-{
- unsigned long end_pfn = -1UL;
-
-#ifdef CONFIG_X86_32
- end_pfn = max_low_pfn;
-#endif
- return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
-}
-
-static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
-{
- int i, count;
- struct range *range;
- int nr_range;
- u64 final_start, final_end;
- u64 free_size;
- struct memblock_region *r;
-
- count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
-
- range = find_range_array(count);
- nr_range = 0;
-
- addr = PFN_UP(addr);
- limit = PFN_DOWN(limit);
-
- for_each_memblock(memory, r) {
- final_start = PFN_UP(r->base);
- final_end = PFN_DOWN(r->base + r->size);
- if (final_start >= final_end)
- continue;
- if (final_start >= limit || final_end <= addr)
- continue;
-
- nr_range = add_range(range, count, nr_range, final_start, final_end);
- }
- subtract_range(range, count, 0, addr);
- subtract_range(range, count, limit, -1ULL);
-
- /* Subtract memblock.reserved.region in range ? */
- if (!get_free)
- goto sort_and_count_them;
- for_each_memblock(reserved, r) {
- final_start = PFN_DOWN(r->base);
- final_end = PFN_UP(r->base + r->size);
- if (final_start >= final_end)
- continue;
- if (final_start >= limit || final_end <= addr)
- continue;
-
- subtract_range(range, count, final_start, final_end);
- }
-
-sort_and_count_them:
- nr_range = clean_sort_range(range, count);
-
- free_size = 0;
- for (i = 0; i < nr_range; i++)
- free_size += range[i].end - range[i].start;
-
- return free_size << PAGE_SHIFT;
-}
-
-u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
-{
- return __memblock_x86_memory_in_range(addr, limit, true);
-}
-
-u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
-{
- return __memblock_x86_memory_in_range(addr, limit, false);
-}
-
-void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
-{
- if (start == end)
- return;
-
- if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
- return;
-
- memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
-
- memblock_reserve(start, end - start);
-}
-
-void __init memblock_x86_free_range(u64 start, u64 end)
-{
- if (start == end)
- return;
-
- if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
- return;
-
- memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
-
- memblock_free(start, end - start);
-}
-
-/*
- * Need to call this function after memblock_x86_register_active_regions,
- * so early_node_map[] is filled already.
- */
-u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
-{
- u64 addr;
- addr = find_memory_core_early(nid, size, align, start, end);
- if (addr != MEMBLOCK_ERROR)
- return addr;
-
- /* Fallback, should already have start end within node range */
- return memblock_find_in_range(start, end, size, align);
-}
-
-/*
- * Finds an active region in the address range from start_pfn to last_pfn and
- * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
- */
-static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
- unsigned long start_pfn,
- unsigned long last_pfn,
- unsigned long *ei_startpfn,
- unsigned long *ei_endpfn)
-{
- u64 align = PAGE_SIZE;
-
- *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
- *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
-
- /* Skip map entries smaller than a page */
- if (*ei_startpfn >= *ei_endpfn)
- return 0;
-
- /* Skip if map is outside the node */
- if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
- return 0;
-
- /* Check for overlaps */
- if (*ei_startpfn < start_pfn)
- *ei_startpfn = start_pfn;
- if (*ei_endpfn > last_pfn)
- *ei_endpfn = last_pfn;
-
- return 1;
-}
-
-/* Walk the memblock.memory map and register active regions within a node */
-void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
- unsigned long last_pfn)
-{
- unsigned long ei_startpfn;
- unsigned long ei_endpfn;
- struct memblock_region *r;
-
- for_each_memblock(memory, r)
- if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- add_active_range(nid, ei_startpfn, ei_endpfn);
-}
-
-/*
- * Find the hole size (in bytes) in the memory range.
- * @start: starting address of the memory range to scan
- * @end: ending address of the memory range to scan
- */
-u64 __init memblock_x86_hole_size(u64 start, u64 end)
-{
- unsigned long start_pfn = start >> PAGE_SHIFT;
- unsigned long last_pfn = end >> PAGE_SHIFT;
- unsigned long ei_startpfn, ei_endpfn, ram = 0;
- struct memblock_region *r;
-
- for_each_memblock(memory, r)
- if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
- &ei_startpfn, &ei_endpfn))
- ram += ei_endpfn - ei_startpfn;
-
- return end - start - ((u64)ram << PAGE_SHIFT);
-}
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 92faf3a1c53e..c80b9fb95734 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
(unsigned long long) pattern,
(unsigned long long) start_bad,
(unsigned long long) end_bad);
- memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
+ memblock_reserve(start_bad, end_bad - start_bad);
}
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -70,24 +70,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size)
static void __init do_one_pass(u64 pattern, u64 start, u64 end)
{
- u64 size = 0;
-
- while (start < end) {
- start = memblock_x86_find_in_range_size(start, &size, 1);
-
- /* done ? */
- if (start >= end)
- break;
- if (start + size > end)
- size = end - start;
-
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long) start,
- (unsigned long long) start + size,
- (unsigned long long) cpu_to_be64(pattern));
- memtest(pattern, start, size);
-
- start += size;
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
+ this_start = clamp_t(phys_addr_t, this_start, start, end);
+ this_end = clamp_t(phys_addr_t, this_end, start, end);
+ if (this_start < this_end) {
+ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+ (unsigned long long)this_start,
+ (unsigned long long)this_end,
+ (unsigned long long)cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
}
}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
*/
if (current->flags & PF_RANDOMIZE) {
if (mmap_is_ia32())
- rnd = (long)get_random_int() % (1<<8);
+ rnd = get_random_int() % (1<<8);
else
- rnd = (long)(get_random_int() % (1<<28));
+ rnd = get_random_int() % (1<<28);
}
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index de54b9b278a7..dc0b727742f4 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -75,8 +75,8 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
/* module parameters */
static unsigned long filter_offset;
-static int nommiotrace;
-static int trace_pc;
+static bool nommiotrace;
+static bool trace_pc;
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fbeaaf416610..19d3fa08b119 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -110,7 +110,7 @@ void __cpuinit numa_clear_node(int cpu)
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
*
- * Note: node_to_cpumask() is not valid until after this is done.
+ * Note: cpumask_of_node() is not valid until after this is done.
* (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
void __init setup_node_to_cpumask_map(void)
@@ -192,8 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end)
/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start, u64 end)
{
- const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
- const u64 nd_high = PFN_PHYS(max_pfn_mapped);
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
bool remapped = false;
u64 nd_pa;
@@ -224,17 +222,12 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
nd_pa = __pa(nd);
remapped = true;
} else {
- nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
- nd_size, SMP_CACHE_BYTES);
- if (nd_pa == MEMBLOCK_ERROR)
- nd_pa = memblock_find_in_range(nd_low, nd_high,
- nd_size, SMP_CACHE_BYTES);
- if (nd_pa == MEMBLOCK_ERROR) {
+ nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa) {
pr_err("Cannot find %zu bytes in node %d\n",
nd_size, nid);
return;
}
- memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
nd = __va(nd_pa);
}
@@ -371,8 +364,7 @@ void __init numa_reset_distance(void)
/* numa_distance could be 1LU marking allocation failure, test cnt */
if (numa_distance_cnt)
- memblock_x86_free_range(__pa(numa_distance),
- __pa(numa_distance) + size);
+ memblock_free(__pa(numa_distance), size);
numa_distance_cnt = 0;
numa_distance = NULL; /* enable table creation */
}
@@ -395,13 +387,13 @@ static int __init numa_alloc_distance(void)
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
size, PAGE_SIZE);
- if (phys == MEMBLOCK_ERROR) {
+ if (!phys) {
pr_warning("NUMA: Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
}
- memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
+ memblock_reserve(phys, size);
numa_distance = __va(phys);
numa_distance_cnt = cnt;
@@ -430,8 +422,9 @@ static int __init numa_alloc_distance(void)
* calls are ignored until the distance table is reset with
* numa_reset_distance().
*
- * If @from or @to is higher than the highest known node at the time of
- * table creation or @distance doesn't make sense, the call is ignored.
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
* This is to allow simplification of specific NUMA config implementations.
*/
void __init numa_set_distance(int from, int to, int distance)
@@ -439,8 +432,9 @@ void __init numa_set_distance(int from, int to, int distance)
if (!numa_distance && numa_alloc_distance() < 0)
return;
- if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
- printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+ from < 0 || to < 0) {
+ pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
from, to, distance);
return;
}
@@ -482,8 +476,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
numaram = 0;
}
- e820ram = max_pfn - (memblock_x86_hole_size(0,
- PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
+ e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
@@ -505,13 +499,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
- for (i = 0; i < mi->nr_blks; i++)
- memblock_x86_register_active_regions(mi->blk[i].nid,
- mi->blk[i].start >> PAGE_SHIFT,
- mi->blk[i].end >> PAGE_SHIFT);
-
- /* for out of order entries */
- sort_node_map();
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *mb = &mi->blk[i];
+ memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
+ }
/*
* If sections array is gonna be used for pfn -> nid mapping, check
@@ -545,6 +536,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
setup_node_data(nid, start, end);
}
+ /* Dump memblock with node info and return. */
+ memblock_dump_all();
return 0;
}
@@ -582,7 +575,7 @@ static int __init numa_init(int (*init_func)(void))
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
- remove_all_active_ranges();
+ WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance();
ret = init_func();
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 3adebe7e536a..534255a36b6b 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end)
/* allocate node memory and the lowmem remap area */
node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
- if (node_pa == MEMBLOCK_ERROR) {
+ if (!node_pa) {
pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
size, nid);
return;
}
- memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
+ memblock_reserve(node_pa, size);
remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
max_low_pfn << PAGE_SHIFT,
size, LARGE_PAGE_BYTES);
- if (remap_pa == MEMBLOCK_ERROR) {
+ if (!remap_pa) {
pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
size, nid);
- memblock_x86_free_range(node_pa, node_pa + size);
+ memblock_free(node_pa, size);
return;
}
- memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
+ memblock_reserve(remap_pa, size);
remap_va = phys_to_virt(remap_pa);
/* perform actual remap */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dd27f401f0a0..92e27119ee1a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
for_each_online_node(i)
pages += free_all_bootmem_node(NODE_DATA(i));
- pages += free_all_memory_core_early(MAX_NUMNODES);
+ pages += free_low_memory_core_early(MAX_NUMNODES);
return pages;
}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index d0ed086b6247..46db56845f18 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -28,6 +28,16 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
return -ENOENT;
}
+static u64 mem_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (start_pfn < end_pfn)
+ return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+ return 0;
+}
+
/*
* Sets up nid to range from @start to @end. The return value is -errno if
* something went wrong, 0 otherwise.
@@ -89,7 +99,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* Calculate target node size. x86_32 freaks on __udivdi3() so do
* the division in ulong number of pages and convert back.
*/
- size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
+ size = max_addr - addr - mem_hole_size(addr, max_addr);
size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
/*
@@ -135,8 +145,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* Continue to add memory to this fake node if its
* non-reserved memory is less than the per-node size.
*/
- while (end - start -
- memblock_x86_hole_size(start, end) < size) {
+ while (end - start - mem_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > limit) {
end = limit;
@@ -150,7 +159,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
- memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
@@ -158,8 +167,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end -
- memblock_x86_hole_size(end, limit) < size)
+ if (limit - end - mem_hole_size(end, limit) < size)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
@@ -180,7 +188,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
u64 end = start + size;
- while (end - start - memblock_x86_hole_size(start, end) < size) {
+ while (end - start - mem_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
end = max_addr;
@@ -211,8 +219,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* creates a uniform distribution of node sizes across the entire
* machine (but not necessarily over physical nodes).
*/
- min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
- MAX_NUMNODES;
+ min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
min_size = max(min_size, FAKE_NODE_MIN_SIZE);
if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
min_size = (min_size + FAKE_NODE_MIN_SIZE) &
@@ -252,7 +259,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
- memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
@@ -260,8 +267,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
* next node, this one must extend to the end of the
* physical node.
*/
- if (limit - end -
- memblock_x86_hole_size(end, limit) < size)
+ if (limit - end - mem_hole_size(end, limit) < size)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
@@ -351,11 +357,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
phys_size, PAGE_SIZE);
- if (phys == MEMBLOCK_ERROR) {
+ if (!phys) {
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
- memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
+ memblock_reserve(phys, phys_size);
phys_dist = __va(phys);
for (i = 0; i < numa_dist_cnt; i++)
@@ -424,7 +430,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
/* free the copied physical distance table */
if (phys_dist)
- memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
+ memblock_free(__pa(phys_dist), phys_size);
return;
no_emu:
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index f9e526742fa1..e1ebde315210 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -998,7 +998,7 @@ out_err:
}
EXPORT_SYMBOL(set_memory_uc);
-int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int addrinarray,
unsigned long new_type)
{
int i, j;
@@ -1334,12 +1334,6 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
}
/*
- * If page allocator is not up yet then do not call c_p_a():
- */
- if (!debug_pagealloc_enabled)
- return;
-
- /*
* The return value is ignored as the calls cannot fail.
* Large pages for identity mappings are not used at boot time
* and hence no memory allocations during large page split.
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..fd61b3fb7341 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return;
pxm = pa->proximity_domain;
+ apic_id = pa->apic_id;
+ if (!cpu_has_x2apic && (apic_id >= 0xff)) {
+ printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
+ pxm, apic_id);
+ return;
+ }
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
return;
}
- apic_id = pa->apic_id;
if (apic_id >= MAX_LOCAL_APIC) {
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
return;
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index bfab3fa10edc..7b65f752c5f8 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -568,8 +568,8 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
break;
}
if (filter[i].jt != 0) {
- if (filter[i].jf)
- t_offset += is_near(f_offset) ? 2 : 6;
+ if (filter[i].jf && f_offset)
+ t_offset += is_near(f_offset) ? 2 : 5;
EMIT_COND_JMP(t_op, t_offset);
if (filter[i].jf)
EMIT_JMP(f_offset);
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 446902b2a6b6..1599f568f0e2 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
oprof.o cpu_buffer.o buffer_sync.o \
event_buffer.o oprofile_files.o \
oprofilefs.o oprofile_stats.o \
- timer_int.o )
+ timer_int.o nmi_timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
op_model_ppro.o op_model_p4.o
-oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
index cdfe4c54deca..9e138d00ad36 100644
--- a/arch/x86/oprofile/init.c
+++ b/arch/x86/oprofile/init.c
@@ -16,34 +16,23 @@
* with the NMI mode driver.
*/
+#ifdef CONFIG_X86_LOCAL_APIC
extern int op_nmi_init(struct oprofile_operations *ops);
-extern int op_nmi_timer_init(struct oprofile_operations *ops);
extern void op_nmi_exit(void);
-extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
+#else
+static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; }
+static void op_nmi_exit(void) { }
+#endif
+extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
int __init oprofile_arch_init(struct oprofile_operations *ops)
{
- int ret;
-
- ret = -ENODEV;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- ret = op_nmi_init(ops);
-#endif
-#ifdef CONFIG_X86_IO_APIC
- if (ret < 0)
- ret = op_nmi_timer_init(ops);
-#endif
ops->backtrace = x86_backtrace;
-
- return ret;
+ return op_nmi_init(ops);
}
-
void oprofile_arch_exit(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
op_nmi_exit();
-#endif
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 75f9528e0372..26b8a8514ee5 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type)
return 0;
}
-static int force_arch_perfmon;
-static int force_cpu_type(const char *str, struct kernel_param *kp)
+enum __force_cpu_type {
+ reserved = 0, /* do not force */
+ timer,
+ arch_perfmon,
+};
+
+static int force_cpu_type;
+
+static int set_cpu_type(const char *str, struct kernel_param *kp)
{
- if (!strcmp(str, "arch_perfmon")) {
- force_arch_perfmon = 1;
+ if (!strcmp(str, "timer")) {
+ force_cpu_type = timer;
+ printk(KERN_INFO "oprofile: forcing NMI timer mode\n");
+ } else if (!strcmp(str, "arch_perfmon")) {
+ force_cpu_type = arch_perfmon;
printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+ } else {
+ force_cpu_type = 0;
}
return 0;
}
-module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
+module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
static int __init ppro_init(char **cpu_type)
{
__u8 cpu_model = boot_cpu_data.x86_model;
struct op_x86_model_spec *spec = &op_ppro_spec; /* default */
- if (force_arch_perfmon && cpu_has_arch_perfmon)
+ if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon)
return 0;
/*
@@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
if (!cpu_has_apic)
return -ENODEV;
+ if (force_cpu_type == timer)
+ return -ENODEV;
+
switch (vendor) {
case X86_VENDOR_AMD:
/* Needs to be at least an Athlon (or hammer in 32bit mode) */
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
deleted file mode 100644
index 7f8052cd6620..000000000000
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/**
- * @file nmi_timer_int.c
- *
- * @remark Copyright 2003 OProfile authors
- * @remark Read the file COPYING
- *
- * @author Zwane Mwaikambo <zwane@linuxpower.ca>
- */
-
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/errno.h>
-#include <linux/oprofile.h>
-#include <linux/rcupdate.h>
-#include <linux/kdebug.h>
-
-#include <asm/nmi.h>
-#include <asm/apic.h>
-#include <asm/ptrace.h>
-
-static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs)
-{
- oprofile_add_sample(regs, 0);
- return NMI_HANDLED;
-}
-
-static int timer_start(void)
-{
- if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify,
- 0, "oprofile-timer"))
- return 1;
- return 0;
-}
-
-
-static void timer_stop(void)
-{
- unregister_nmi_handler(NMI_LOCAL, "oprofile-timer");
- synchronize_sched(); /* Allow already-started NMIs to complete. */
-}
-
-
-int __init op_nmi_timer_init(struct oprofile_operations *ops)
-{
- ops->start = timer_start;
- ops->stop = timer_stop;
- ops->cpu_type = "timer";
- printk(KERN_INFO "oprofile: using NMI timer interrupt.\n");
- return 0;
-}
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..e76e18c94a3c 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -15,11 +15,12 @@ obj-$(CONFIG_X86_VISWS) += visws.o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
-obj-$(CONFIG_X86_MRST) += mrst.o
+obj-$(CONFIG_X86_INTEL_MID) += mrst.o
obj-y += common.o early.o
-obj-y += amd_bus.o bus_numa.o
+obj-y += bus_numa.o
+obj-$(CONFIG_AMD_NB) += amd_bus.o
obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
ifeq ($(CONFIG_PCI_DEBUG),y)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a3ff9e..a312e76063a7 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -12,7 +12,7 @@ struct pci_root_info {
char *name;
unsigned int res_num;
struct resource *res;
- struct pci_bus *bus;
+ struct list_head *resources;
int busnum;
};
@@ -24,6 +24,12 @@ static int __init set_use_crs(const struct dmi_system_id *id)
return 0;
}
+static int __init set_nouse_crs(const struct dmi_system_id *id)
+{
+ pci_use_crs = false;
+ return 0;
+}
+
static const struct dmi_system_id pci_use_crs_table[] __initconst = {
/* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
{
@@ -54,6 +60,29 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = {
DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
},
},
+
+ /* Now for the blacklist.. */
+
+ /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+ {
+ .callback = set_nouse_crs,
+ .ident = "Dell Studio 1557",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
+ DMI_MATCH(DMI_BIOS_VERSION, "A09"),
+ },
+ },
+ /* https://bugzilla.redhat.com/show_bug.cgi?id=769657 */
+ {
+ .callback = set_nouse_crs,
+ .ident = "Thinkpad SL510",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "LENOVO"),
+ DMI_MATCH(DMI_BOARD_NAME, "2847DFG"),
+ DMI_MATCH(DMI_BIOS_VERSION, "6JET85WW (1.43 )"),
+ },
+ },
{}
};
@@ -149,7 +178,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
struct acpi_resource_address64 addr;
acpi_status status;
unsigned long flags;
- u64 start, end;
+ u64 start, orig_end, end;
status = resource_to_addr(acpi_res, &addr);
if (!ACPI_SUCCESS(status))
@@ -165,7 +194,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
start = addr.minimum + addr.translation_offset;
- end = addr.maximum + addr.translation_offset;
+ orig_end = end = addr.maximum + addr.translation_offset;
+
+ /* Exclude non-addressable range or non-addressable portion of range */
+ end = min(end, (u64)iomem_resource.end);
+ if (end <= start) {
+ dev_info(&info->bridge->dev,
+ "host bridge window [%#llx-%#llx] "
+ "(ignored, not CPU addressable)\n", start, orig_end);
+ return AE_OK;
+ } else if (orig_end != end) {
+ dev_info(&info->bridge->dev,
+ "host bridge window [%#llx-%#llx] "
+ "([%#llx-%#llx] ignored, not CPU addressable)\n",
+ start, orig_end, end + 1, orig_end);
+ }
res = &info->res[info->res_num];
res->name = info->name;
@@ -261,23 +304,20 @@ static void add_resources(struct pci_root_info *info)
"ignoring host bridge window %pR (conflicts with %s %pR)\n",
res, conflict->name, conflict);
else
- pci_bus_add_resource(info->bus, res, 0);
+ pci_add_resource(info->resources, res);
}
}
static void
get_current_resources(struct acpi_device *device, int busnum,
- int domain, struct pci_bus *bus)
+ int domain, struct list_head *resources)
{
struct pci_root_info info;
size_t size;
- if (pci_use_crs)
- pci_bus_remove_resources(bus);
-
info.bridge = device;
- info.bus = bus;
info.res_num = 0;
+ info.resources = resources;
acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
&info);
if (!info.res_num)
@@ -286,7 +326,7 @@ get_current_resources(struct acpi_device *device, int busnum,
size = sizeof(*info.res) * info.res_num;
info.res = kmalloc(size, GFP_KERNEL);
if (!info.res)
- goto res_alloc_fail;
+ return;
info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
if (!info.name)
@@ -301,8 +341,6 @@ get_current_resources(struct acpi_device *device, int busnum,
name_alloc_fail:
kfree(info.res);
-res_alloc_fail:
- return;
}
struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -310,6 +348,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
struct acpi_device *device = root->device;
int domain = root->segment;
int busnum = root->secondary.start;
+ LIST_HEAD(resources);
struct pci_bus *bus;
struct pci_sysdata *sd;
int node;
@@ -364,11 +403,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
memcpy(bus->sysdata, sd, sizeof(*sd));
kfree(sd);
} else {
- bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
- if (bus) {
- get_current_resources(device, busnum, domain, bus);
+ get_current_resources(device, busnum, domain, &resources);
+ if (list_empty(&resources))
+ x86_pci_root_bus_resources(busnum, &resources);
+ bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd,
+ &resources);
+ if (bus)
bus->subordinate = pci_scan_child_bus(bus);
- }
+ else
+ pci_free_resource_list(&resources);
}
/* After the PCI-E bus has been walked and all devices discovered,
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d162..0567df3890e1 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
};
-static u64 __initdata fam10h_mmconf_start;
-static u64 __initdata fam10h_mmconf_end;
-static void __init get_pci_mmcfg_amd_fam10h_range(void)
-{
- u32 address;
- u64 base, msr;
- unsigned segn_busn_bits;
-
- /* assume all cpus from fam10h have mmconf */
- if (boot_cpu_data.x86 < 0x10)
- return;
-
- address = MSR_FAM10H_MMIO_CONF_BASE;
- rdmsrl(address, msr);
-
- /* mmconfig is not enable */
- if (!(msr & FAM10H_MMIO_CONF_ENABLE))
- return;
-
- base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
-
- segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
- FAM10H_MMIO_CONF_BUSRANGE_MASK;
-
- fam10h_mmconf_start = base;
- fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
-}
-
#define RANGE_NUM 16
/**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
u64 val;
u32 address;
bool found;
+ struct resource fam10h_mmconf_res, *fam10h_mmconf;
+ u64 fam10h_mmconf_start;
+ u64 fam10h_mmconf_end;
if (!early_pci_allowed())
return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
subtract_range(range, RANGE_NUM, 0, end);
/* get mmconfig */
- get_pci_mmcfg_amd_fam10h_range();
+ fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
/* need to take out mmconf range */
- if (fam10h_mmconf_end) {
- printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
+ if (fam10h_mmconf) {
+ printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
+ fam10h_mmconf_start = fam10h_mmconf->start;
+ fam10h_mmconf_end = fam10h_mmconf->end;
subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
fam10h_mmconf_end + 1);
+ } else {
+ fam10h_mmconf_start = 0;
+ fam10h_mmconf_end = 0;
}
/* mmio resource */
@@ -403,7 +383,6 @@ static void __init pci_enable_pci_io_ecs(void)
++n;
}
}
- pr_info("Extended Config Space enabled on %u nodes\n", n);
#endif
}
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index ab8269b0da29..f3a7c569a403 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -15,10 +15,11 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
#include "bus_numa.h"
-static void __devinit cnb20le_res(struct pci_dev *dev)
+static void __init cnb20le_res(u8 bus, u8 slot, u8 func)
{
struct pci_root_info *info;
struct resource res;
@@ -26,21 +27,12 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
u8 fbus, lbus;
int i;
-#ifdef CONFIG_ACPI
- /*
- * We should get host bridge information from ACPI unless the BIOS
- * doesn't support it.
- */
- if (acpi_os_get_root_pointer())
- return;
-#endif
-
info = &pci_root_info[pci_root_num];
pci_root_num++;
/* read the PCI bus numbers */
- pci_read_config_byte(dev, 0x44, &fbus);
- pci_read_config_byte(dev, 0x45, &lbus);
+ fbus = read_pci_config_byte(bus, slot, func, 0x44);
+ lbus = read_pci_config_byte(bus, slot, func, 0x45);
info->bus_min = fbus;
info->bus_max = lbus;
@@ -59,8 +51,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
}
/* read the non-prefetchable memory window */
- pci_read_config_word(dev, 0xc0, &word1);
- pci_read_config_word(dev, 0xc2, &word2);
+ word1 = read_pci_config_16(bus, slot, func, 0xc0);
+ word2 = read_pci_config_16(bus, slot, func, 0xc2);
if (word1 != word2) {
res.start = (word1 << 16) | 0x0000;
res.end = (word2 << 16) | 0xffff;
@@ -69,8 +61,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
}
/* read the prefetchable memory window */
- pci_read_config_word(dev, 0xc4, &word1);
- pci_read_config_word(dev, 0xc6, &word2);
+ word1 = read_pci_config_16(bus, slot, func, 0xc4);
+ word2 = read_pci_config_16(bus, slot, func, 0xc6);
if (word1 != word2) {
res.start = (word1 << 16) | 0x0000;
res.end = (word2 << 16) | 0xffff;
@@ -79,8 +71,8 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
}
/* read the IO port window */
- pci_read_config_word(dev, 0xd0, &word1);
- pci_read_config_word(dev, 0xd2, &word2);
+ word1 = read_pci_config_16(bus, slot, func, 0xd0);
+ word2 = read_pci_config_16(bus, slot, func, 0xd2);
if (word1 != word2) {
res.start = word1;
res.end = word2;
@@ -92,13 +84,37 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
res.start = fbus;
res.end = lbus;
res.flags = IORESOURCE_BUS;
- dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
- pci_domain_nr(dev->bus), &res);
+ printk(KERN_INFO "CNB20LE PCI Host Bridge (domain 0000 %pR)\n", &res);
for (i = 0; i < info->res_num; i++)
- dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+ printk(KERN_INFO "host bridge window %pR\n", &info->res[i]);
}
-DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
- cnb20le_res);
+static int __init broadcom_postcore_init(void)
+{
+ u8 bus = 0, slot = 0;
+ u32 id;
+ u16 vendor, device;
+
+#ifdef CONFIG_ACPI
+ /*
+ * We should get host bridge information from ACPI unless the BIOS
+ * doesn't support it.
+ */
+ if (acpi_os_get_root_pointer())
+ return 0;
+#endif
+
+ id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID);
+ vendor = id & 0xffff;
+ device = (id >> 16) & 0xffff;
+
+ if (vendor == PCI_VENDOR_ID_SERVERWORKS &&
+ device == PCI_DEVICE_ID_SERVERWORKS_LE) {
+ cnb20le_res(bus, slot, 0);
+ cnb20le_res(bus, slot, 1);
+ }
+ return 0;
+}
+postcore_initcall(broadcom_postcore_init);
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index 64a122883896..fd3f65510e9d 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -7,45 +7,50 @@
int pci_root_num;
struct pci_root_info pci_root_info[PCI_ROOT_NR];
-void x86_pci_root_bus_res_quirks(struct pci_bus *b)
+void x86_pci_root_bus_resources(int bus, struct list_head *resources)
{
int i;
int j;
struct pci_root_info *info;
- /* don't go for it if _CRS is used already */
- if (b->resource[0] != &ioport_resource ||
- b->resource[1] != &iomem_resource)
- return;
-
if (!pci_root_num)
- return;
+ goto default_resources;
for (i = 0; i < pci_root_num; i++) {
- if (pci_root_info[i].bus_min == b->number)
+ if (pci_root_info[i].bus_min == bus)
break;
}
if (i == pci_root_num)
- return;
+ goto default_resources;
- printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
- b->number);
+ printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n",
+ bus);
- pci_bus_remove_resources(b);
info = &pci_root_info[i];
for (j = 0; j < info->res_num; j++) {
struct resource *res;
struct resource *root;
res = &info->res[j];
- pci_bus_add_resource(b, res, 0);
+ pci_add_resource(resources, res);
if (res->flags & IORESOURCE_IO)
root = &ioport_resource;
else
root = &iomem_resource;
insert_resource(root, res);
}
+ return;
+
+default_resources:
+ /*
+ * We don't have any host bridge aperture information from the
+ * "native host bridge drivers," e.g., amd_bus or broadcom_bus,
+ * so fall back to the defaults historically used by pci_create_bus().
+ */
+ printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus);
+ pci_add_resource(resources, &ioport_resource);
+ pci_add_resource(resources, &iomem_resource);
}
void __devinit update_res(struct pci_root_info *info, resource_size_t start,
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7962ccb4d9b2..323481e06ef8 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -164,9 +164,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
{
struct pci_dev *dev;
- /* root bus? */
- if (!b->parent)
- x86_pci_root_bus_res_quirks(b);
pci_read_bridge_bases(b);
list_for_each_entry(dev, &b->devices, bus_list)
pcibios_fixup_device_resources(dev);
@@ -433,6 +430,7 @@ void __init dmi_check_pciprobe(void)
struct pci_bus * __devinit pcibios_scan_root(int busnum)
{
+ LIST_HEAD(resources);
struct pci_bus *bus = NULL;
struct pci_sysdata *sd;
@@ -456,9 +454,12 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
sd->node = get_mp_bus_to_node(busnum);
printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);
- bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
- if (!bus)
+ x86_pci_root_bus_resources(busnum, &resources);
+ bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources);
+ if (!bus) {
+ pci_free_resource_list(&resources);
kfree(sd);
+ }
return bus;
}
@@ -639,6 +640,7 @@ int pci_ext_cfg_avail(struct pci_dev *dev)
struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node)
{
+ LIST_HEAD(resources);
struct pci_bus *bus = NULL;
struct pci_sysdata *sd;
@@ -653,9 +655,12 @@ struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops,
return NULL;
}
sd->node = node;
- bus = pci_scan_bus(busno, ops, sd);
- if (!bus)
+ x86_pci_root_bus_resources(busno, &resources);
+ bus = pci_scan_root_bus(NULL, busno, ops, sd, &resources);
+ if (!bus) {
+ pci_free_resource_list(&resources);
kfree(sd);
+ }
return bus;
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 794b092d01ae..91821a1a0c3a 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -254,26 +254,6 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
-/*
- * If we set up a device for bus mastering, we need to check the latency
- * timer as certain crappy BIOSes forget to set it properly.
- */
-unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
- u8 lat;
- pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
- if (lat < 16)
- lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
- else if (lat > pcibios_max_latency)
- lat = pcibios_max_latency;
- else
- return;
- dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
- pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
static const struct vm_operations_struct pci_mmap_ops = {
.access = generic_access_phys,
};
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 2c2aeabc2609..a1df191129d3 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -31,9 +31,6 @@ int __init pci_legacy_init(void)
printk("PCI: Probing PCI hardware\n");
pci_root_bus = pcibios_scan_root(0);
- if (pci_root_bus)
- pci_bus_add_devices(pci_root_bus);
-
return 0;
}
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 51abf02f9226..83e125b95ca6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -153,8 +153,6 @@ int __init pci_numaq_init(void)
raw_pci_ops = &pci_direct_conf1_mq;
pci_root_bus = pcibios_scan_root(0);
- if (pci_root_bus)
- pci_bus_add_devices(pci_root_bus);
if (num_online_nodes() > 1)
for_each_online_node(quad) {
if (quad == 0)
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index db0e9a51e611..da8fe0535ff4 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -44,7 +44,7 @@ static inline void set_bios_x(void)
pcibios_enabled = 1;
set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
if (__supported_pte_mask & _PAGE_NX)
- printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+ printk(KERN_INFO "PCI : PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
}
/*
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 37718f0f053d..4cf9bd0a1653 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
spin_lock_irqsave(&rtc_lock, flags);
efi_call_phys_prelog();
- status = efi_call_phys2(efi_phys.get_time, tm, tc);
+ status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm),
+ virt_to_phys(tc));
efi_call_phys_epilog();
spin_unlock_irqrestore(&rtc_lock, flags);
return status;
@@ -352,8 +353,7 @@ void __init efi_memblock_x86_reserve_range(void)
boot_params.efi_info.efi_memdesc_size;
memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
- memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
- "EFI memmap");
+ memblock_reserve(pmap, memmap.nr_map * memmap.desc_size);
}
#if EFI_DEBUG
@@ -397,16 +397,14 @@ void __init efi_reserve_boot_services(void)
if ((start+size >= virt_to_phys(_text)
&& start <= virt_to_phys(_end)) ||
!e820_all_mapped(start, start+size, E820_RAM) ||
- memblock_x86_check_reserved_size(&start, &size,
- 1<<EFI_PAGE_SHIFT)) {
+ memblock_is_region_reserved(start, size)) {
/* Could not reserve, skip it */
md->num_pages = 0;
memblock_dbg(PFX "Could not reserve boot range "
"[0x%010llx-0x%010llx]\n",
start, start+size-1);
} else
- memblock_x86_reserve_range(start, start+size,
- "EFI Boot");
+ memblock_reserve(start, size);
}
}
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index e36bf714cb77..40e446941dd7 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -39,43 +39,14 @@
*/
static unsigned long efi_rt_eflags;
-static pgd_t efi_bak_pg_dir_pointer[2];
void efi_call_phys_prelog(void)
{
- unsigned long cr4;
- unsigned long temp;
struct desc_ptr gdt_descr;
local_irq_save(efi_rt_eflags);
- /*
- * If I don't have PAE, I should just duplicate two entries in page
- * directory. If I have PAE, I just need to duplicate one entry in
- * page directory.
- */
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- swapper_pg_dir[0].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- } else {
- efi_bak_pg_dir_pointer[0].pgd =
- swapper_pg_dir[pgd_index(0)].pgd;
- efi_bak_pg_dir_pointer[1].pgd =
- swapper_pg_dir[pgd_index(0x400000)].pgd;
- swapper_pg_dir[pgd_index(0)].pgd =
- swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
- temp = PAGE_OFFSET + 0x400000;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- swapper_pg_dir[pgd_index(temp)].pgd;
- }
-
- /*
- * After the lock is released, the original page table is restored.
- */
+ load_cr3(initial_page_table);
__flush_tlb_all();
gdt_descr.address = __pa(get_cpu_gdt_table(0));
@@ -85,28 +56,13 @@ void efi_call_phys_prelog(void)
void efi_call_phys_epilog(void)
{
- unsigned long cr4;
struct desc_ptr gdt_descr;
gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- cr4 = read_cr4_safe();
-
- if (cr4 & X86_CR4_PAE) {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- } else {
- swapper_pg_dir[pgd_index(0)].pgd =
- efi_bak_pg_dir_pointer[0].pgd;
- swapper_pg_dir[pgd_index(0x400000)].pgd =
- efi_bak_pg_dir_pointer[1].pgd;
- }
-
- /*
- * After the lock is released, the original page table is restored.
- */
+ load_cr3(swapper_pg_dir);
__flush_tlb_all();
local_irq_restore(efi_rt_eflags);
diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c
index ca1973699d3d..dc5f1d32aced 100644
--- a/arch/x86/platform/geode/alix.c
+++ b/arch/x86/platform/geode/alix.c
@@ -27,7 +27,7 @@
#include <asm/geode.h>
-static int force = 0;
+static bool force = 0;
module_param(force, bool, 0444);
/* FIXME: Award bios is not automatically detected as Alix platform */
MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform");
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
index 1ba7f5ed8c9b..5917eb56b313 100644
--- a/arch/x86/platform/iris/iris.c
+++ b/arch/x86/platform/iris/iris.c
@@ -42,7 +42,7 @@ MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
-static int force;
+static bool force;
module_param(force, bool, 0);
MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index 1ea38775a6d3..7baed5135e0f 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_X86_MRST) += mrst.o
-obj-$(CONFIG_X86_MRST) += vrtc.o
-obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
+obj-$(CONFIG_X86_INTEL_MID) += mrst.o
+obj-$(CONFIG_X86_INTEL_MID) += vrtc.o
+obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_mrst.o
obj-$(CONFIG_X86_MRST) += pmu.o
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 25bfdbb5b130..3c6e328483c7 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -245,16 +245,24 @@ struct console early_mrst_console = {
* Following is the early console based on Medfield HSU (High
* Speed UART) device.
*/
-#define HSU_PORT2_PADDR 0xffa28180
+#define HSU_PORT_BASE 0xffa28080
static void __iomem *phsu;
-void hsu_early_console_init(void)
+void hsu_early_console_init(const char *s)
{
+ unsigned long paddr, port = 0;
u8 lcr;
- phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- HSU_PORT2_PADDR);
+ /*
+ * Select the early HSU console port if specified by user in the
+ * kernel command line.
+ */
+ if (*s && !kstrtoul(s, 10, &port))
+ port = clamp_val(port, 0, 2);
+
+ paddr = HSU_PORT_BASE + port * 0x80;
+ phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
/* Disable FIFO */
writeb(0x0, phsu + UART_FCR);
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 541020df0da6..475e2cd0f3c3 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -76,6 +76,20 @@ struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
EXPORT_SYMBOL_GPL(sfi_mrtc_array);
int sfi_mrtc_num;
+static void mrst_power_off(void)
+{
+ if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
+ intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 1);
+}
+
+static void mrst_reboot(void)
+{
+ if (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT)
+ intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0);
+ else
+ intel_scu_ipc_simple_command(IPCMSG_COLD_BOOT, 0);
+}
+
/* parse all the mtimer info to a static mtimer array */
static int __init sfi_parse_mtmr(struct sfi_table_header *table)
{
@@ -187,11 +201,34 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
static unsigned long __init mrst_calibrate_tsc(void)
{
unsigned long flags, fast_calibrate;
-
- local_irq_save(flags);
- fast_calibrate = apbt_quick_calibrate();
- local_irq_restore(flags);
-
+ if (__mrst_cpu_chip == MRST_CPU_CHIP_PENWELL) {
+ u32 lo, hi, ratio, fsb;
+
+ rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
+ pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi);
+ ratio = (hi >> 8) & 0x1f;
+ pr_debug("ratio is %d\n", ratio);
+ if (!ratio) {
+ pr_err("read a zero ratio, should be incorrect!\n");
+ pr_err("force tsc ratio to 16 ...\n");
+ ratio = 16;
+ }
+ rdmsr(MSR_FSB_FREQ, lo, hi);
+ if ((lo & 0x7) == 0x7)
+ fsb = PENWELL_FSB_FREQ_83SKU;
+ else
+ fsb = PENWELL_FSB_FREQ_100SKU;
+ fast_calibrate = ratio * fsb;
+ pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
+ lapic_timer_frequency = fsb * 1000 / HZ;
+ /* mark tsc clocksource as reliable */
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+ } else {
+ local_irq_save(flags);
+ fast_calibrate = apbt_quick_calibrate();
+ local_irq_restore(flags);
+ }
+
if (fast_calibrate)
return fast_calibrate;
@@ -242,15 +279,15 @@ static int mrst_i8042_detect(void)
return 0;
}
-/* Reboot and power off are handled by the SCU on a MID device */
-static void mrst_power_off(void)
-{
- intel_scu_ipc_simple_command(0xf1, 1);
-}
-
-static void mrst_reboot(void)
+/*
+ * Moorestown does not have external NMI source nor port 0x61 to report
+ * NMI status. The possible NMI sources are from pmu as a result of NMI
+ * watchdog or lock debug. Reading io port 0x61 results in 0xff which
+ * misled NMI handler.
+ */
+static unsigned char mrst_get_nmi_reason(void)
{
- intel_scu_ipc_simple_command(0xf1, 0);
+ return 0;
}
/*
@@ -274,6 +311,8 @@ void __init x86_mrst_early_setup(void)
x86_platform.calibrate_tsc = mrst_calibrate_tsc;
x86_platform.i8042_detect = mrst_i8042_detect;
x86_init.timers.wallclock_init = mrst_rtc_init;
+ x86_platform.get_nmi_reason = mrst_get_nmi_reason;
+
x86_init.pci.init = pci_mrst_init;
x86_init.pci.fixup_irqs = x86_init_noop;
@@ -448,6 +487,46 @@ static void __init *max7315_platform_data(void *info)
return max7315;
}
+static void *tca6416_platform_data(void *info)
+{
+ static struct pca953x_platform_data tca6416;
+ struct i2c_board_info *i2c_info = info;
+ int gpio_base, intr;
+ char base_pin_name[SFI_NAME_LEN + 1];
+ char intr_pin_name[SFI_NAME_LEN + 1];
+
+ strcpy(i2c_info->type, "tca6416");
+ strcpy(base_pin_name, "tca6416_base");
+ strcpy(intr_pin_name, "tca6416_int");
+
+ gpio_base = get_gpio_by_name(base_pin_name);
+ intr = get_gpio_by_name(intr_pin_name);
+
+ if (gpio_base == -1)
+ return NULL;
+ tca6416.gpio_base = gpio_base;
+ if (intr != -1) {
+ i2c_info->irq = intr + MRST_IRQ_OFFSET;
+ tca6416.irq_base = gpio_base + MRST_IRQ_OFFSET;
+ } else {
+ i2c_info->irq = -1;
+ tca6416.irq_base = -1;
+ }
+ return &tca6416;
+}
+
+static void *mpu3050_platform_data(void *info)
+{
+ struct i2c_board_info *i2c_info = info;
+ int intr = get_gpio_by_name("mpu3050_int");
+
+ if (intr == -1)
+ return NULL;
+
+ i2c_info->irq = intr + MRST_IRQ_OFFSET;
+ return NULL;
+}
+
static void __init *emc1403_platform_data(void *info)
{
static short intr2nd_pdata;
@@ -610,12 +689,15 @@ static void *msic_ocd_platform_data(void *info)
static const struct devs_id __initconst device_ids[] = {
{"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data},
{"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
+ {"pmic_gpio", SFI_DEV_TYPE_IPC, 1, &pmic_gpio_platform_data},
{"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
{"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
{"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
+ {"tca6416", SFI_DEV_TYPE_I2C, 1, &tca6416_platform_data},
{"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
{"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
{"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
+ {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data},
/* MSIC subdevices */
{"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data},
@@ -766,8 +848,7 @@ static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry)
if (mrst_has_msic())
return;
- /* ID as IRQ is a hack that will go away */
- pdev = platform_device_alloc(entry->name, entry->irq);
+ pdev = platform_device_alloc(entry->name, 0);
if (pdev == NULL) {
pr_err("out of memory for SFI platform device '%s'.\n",
entry->name);
@@ -948,6 +1029,7 @@ static int __init pb_keys_init(void)
num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
for (i = 0; i < num; i++) {
gb[i].gpio = get_gpio_by_name(gb[i].desc);
+ pr_debug("info[%2d]: name = %s, gpio = %d\n", i, gb[i].desc, gb[i].gpio);
if (gb[i].gpio == -1)
continue;
diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..5d4ba301e776 100644
--- a/arch/x86/platform/uv/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
@@ -19,7 +19,7 @@
* Copyright (c) Russ Anderson
*/
-#include <linux/sysdev.h>
+#include <linux/device.h>
#include <asm/uv/bios.h>
#include <asm/uv/uv.h>
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index f82082677337..d511aa97533a 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
quiet_cmd_posttest = TEST $@
cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
-posttest: $(obj)/test_get_len vmlinux
+quiet_cmd_sanitytest = TEST $@
+ cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000
+
+posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity
$(call cmd,posttest)
+ $(call cmd,sanitytest)
-hostprogs-y := test_get_len
+hostprogs-y += test_get_len insn_sanity
# -I needed for generated C source and C source which in the kernel tree.
HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
+
# Dependencies are also needed.
$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
+$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index eaf11f52fc0b..5f6a5b6c3a15 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -47,7 +47,7 @@ BEGIN {
sep_expr = "^\\|$"
group_expr = "^Grp[0-9A-Za-z]+"
- imm_expr = "^[IJAO][a-z]"
+ imm_expr = "^[IJAOL][a-z]"
imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
@@ -59,6 +59,7 @@ BEGIN {
imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
imm_flag["Ob"] = "INAT_MOFFSET"
imm_flag["Ov"] = "INAT_MOFFSET"
+ imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
force64_expr = "\\([df]64\\)"
@@ -70,8 +71,12 @@ BEGIN {
lprefix3_expr = "\\(F2\\)"
max_lprefix = 4
- vexok_expr = "\\(VEX\\)"
- vexonly_expr = "\\(oVEX\\)"
+ # All opcodes starting with lower-case 'v' or with (v1) superscript
+ # accepts VEX prefix
+ vexok_opcode_expr = "^v.*"
+ vexok_expr = "\\(v1\\)"
+ # All opcodes with (v) superscript supports *only* VEX prefix
+ vexonly_expr = "\\(v\\)"
prefix_expr = "\\(Prefix\\)"
prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
@@ -85,8 +90,8 @@ BEGIN {
prefix_num["SEG=GS"] = "INAT_PFX_GS"
prefix_num["SEG=SS"] = "INAT_PFX_SS"
prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
- prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
- prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
+ prefix_num["VEX+1byte"] = "INAT_PFX_VEX2"
+ prefix_num["VEX+2byte"] = "INAT_PFX_VEX3"
clear_vars()
}
@@ -310,12 +315,10 @@ function convert_operands(count,opnd, i,j,imm,mod)
if (match(opcode, fpu_expr))
flags = add_flags(flags, "INAT_MODRM")
- # check VEX only code
+ # check VEX codes
if (match(ext, vexonly_expr))
flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
-
- # check VEX only code
- if (match(ext, vexok_expr))
+ else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr))
flags = add_flags(flags, "INAT_VEXOK")
# check prefixes
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
new file mode 100644
index 000000000000..cc2f8c131286
--- /dev/null
+++ b/arch/x86/tools/insn_sanity.c
@@ -0,0 +1,275 @@
+/*
+ * x86 decoder sanity test - based on test_get_insn.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2009
+ * Copyright (C) Hitachi, Ltd., 2011
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define unlikely(cond) (cond)
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+#include <asm/insn.h>
+#include <inat.c>
+#include <insn.c>
+
+/*
+ * Test of instruction analysis against tampering.
+ * Feed random binary to instruction decoder and ensure not to
+ * access out-of-instruction-buffer.
+ */
+
+#define DEFAULT_MAX_ITER 10000
+#define INSN_NOP 0x90
+
+static const char *prog; /* Program name */
+static int verbose; /* Verbosity */
+static int x86_64; /* x86-64 bit mode flag */
+static unsigned int seed; /* Random seed */
+static unsigned long iter_start; /* Start of iteration number */
+static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */
+static FILE *input_file; /* Input file name */
+
+static void usage(const char *err)
+{
+ if (err)
+ fprintf(stderr, "Error: %s\n\n", err);
+ fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog);
+ fprintf(stderr, "\t-y 64bit mode\n");
+ fprintf(stderr, "\t-n 32bit mode\n");
+ fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n");
+ fprintf(stderr, "\t-s Give a random seed (and iteration number)\n");
+ fprintf(stderr, "\t-m Give a maximum iteration number\n");
+ fprintf(stderr, "\t-i Give an input file with decoded binary\n");
+ exit(1);
+}
+
+static void dump_field(FILE *fp, const char *name, const char *indent,
+ struct insn_field *field)
+{
+ fprintf(fp, "%s.%s = {\n", indent, name);
+ fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
+ indent, field->value, field->bytes[0], field->bytes[1],
+ field->bytes[2], field->bytes[3]);
+ fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
+ field->got, field->nbytes);
+}
+
+static void dump_insn(FILE *fp, struct insn *insn)
+{
+ fprintf(fp, "Instruction = {\n");
+ dump_field(fp, "prefixes", "\t", &insn->prefixes);
+ dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
+ dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
+ dump_field(fp, "opcode", "\t", &insn->opcode);
+ dump_field(fp, "modrm", "\t", &insn->modrm);
+ dump_field(fp, "sib", "\t", &insn->sib);
+ dump_field(fp, "displacement", "\t", &insn->displacement);
+ dump_field(fp, "immediate1", "\t", &insn->immediate1);
+ dump_field(fp, "immediate2", "\t", &insn->immediate2);
+ fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
+ insn->attr, insn->opnd_bytes, insn->addr_bytes);
+ fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
+ insn->length, insn->x86_64, insn->kaddr);
+}
+
+static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter,
+ unsigned char *insn_buf, struct insn *insn)
+{
+ int i;
+
+ fprintf(fp, "%s:\n", msg);
+
+ dump_insn(fp, insn);
+
+ fprintf(fp, "You can reproduce this with below command(s);\n");
+
+ /* Input a decoded instruction sequence directly */
+ fprintf(fp, " $ echo ");
+ for (i = 0; i < MAX_INSN_SIZE; i++)
+ fprintf(fp, " %02x", insn_buf[i]);
+ fprintf(fp, " | %s -i -\n", prog);
+
+ if (!input_file) {
+ fprintf(fp, "Or \n");
+ /* Give a seed and iteration number */
+ fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter);
+ }
+}
+
+static void init_random_seed(void)
+{
+ int fd;
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0)
+ goto fail;
+
+ if (read(fd, &seed, sizeof(seed)) != sizeof(seed))
+ goto fail;
+
+ close(fd);
+ return;
+fail:
+ usage("Failed to open /dev/urandom");
+}
+
+/* Read given instruction sequence from the input file */
+static int read_next_insn(unsigned char *insn_buf)
+{
+ char buf[256] = "", *tmp;
+ int i;
+
+ tmp = fgets(buf, ARRAY_SIZE(buf), input_file);
+ if (tmp == NULL || feof(input_file))
+ return 0;
+
+ for (i = 0; i < MAX_INSN_SIZE; i++) {
+ insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16);
+ if (*tmp != ' ')
+ break;
+ }
+
+ return i;
+}
+
+static int generate_insn(unsigned char *insn_buf)
+{
+ int i;
+
+ if (input_file)
+ return read_next_insn(insn_buf);
+
+ /* Fills buffer with random binary up to MAX_INSN_SIZE */
+ for (i = 0; i < MAX_INSN_SIZE - 1; i += 2)
+ *(unsigned short *)(&insn_buf[i]) = random() & 0xffff;
+
+ while (i < MAX_INSN_SIZE)
+ insn_buf[i++] = random() & 0xff;
+
+ return i;
+}
+
+static void parse_args(int argc, char **argv)
+{
+ int c;
+ char *tmp = NULL;
+ int set_seed = 0;
+
+ prog = argv[0];
+ while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) {
+ switch (c) {
+ case 'y':
+ x86_64 = 1;
+ break;
+ case 'n':
+ x86_64 = 0;
+ break;
+ case 'v':
+ verbose++;
+ break;
+ case 'i':
+ if (strcmp("-", optarg) == 0)
+ input_file = stdin;
+ else
+ input_file = fopen(optarg, "r");
+ if (!input_file)
+ usage("Failed to open input file");
+ break;
+ case 's':
+ seed = (unsigned int)strtoul(optarg, &tmp, 0);
+ if (*tmp == ',') {
+ optarg = tmp + 1;
+ iter_start = strtoul(optarg, &tmp, 0);
+ }
+ if (*tmp != '\0' || tmp == optarg)
+ usage("Failed to parse seed");
+ set_seed = 1;
+ break;
+ case 'm':
+ iter_end = strtoul(optarg, &tmp, 0);
+ if (*tmp != '\0' || tmp == optarg)
+ usage("Failed to parse max_iter");
+ break;
+ default:
+ usage(NULL);
+ }
+ }
+
+ /* Check errors */
+ if (iter_end < iter_start)
+ usage("Max iteration number must be bigger than iter-num");
+
+ if (set_seed && input_file)
+ usage("Don't use input file (-i) with random seed (-s)");
+
+ /* Initialize random seed */
+ if (!input_file) {
+ if (!set_seed) /* No seed is given */
+ init_random_seed();
+ srand(seed);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct insn insn;
+ int insns = 0;
+ int errors = 0;
+ unsigned long i;
+ unsigned char insn_buf[MAX_INSN_SIZE * 2];
+
+ parse_args(argc, argv);
+
+ /* Prepare stop bytes with NOPs */
+ memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE);
+
+ for (i = 0; i < iter_end; i++) {
+ if (generate_insn(insn_buf) <= 0)
+ break;
+
+ if (i < iter_start) /* Skip to given iteration number */
+ continue;
+
+ /* Decode an instruction */
+ insn_init(&insn, insn_buf, x86_64);
+ insn_get_length(&insn);
+
+ if (insn.next_byte <= insn.kaddr ||
+ insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
+ /* Access out-of-range memory */
+ dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn);
+ errors++;
+ } else if (verbose && !insn_complete(&insn))
+ dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn);
+ else if (verbose >= 2)
+ dump_insn(stdout, &insn);
+ insns++;
+ }
+
+ fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed);
+
+ return errors ? 1 : 0;
+}
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 1d97bd84b6fb..b2b54d2edf53 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -6,14 +6,6 @@ menu "UML-specific options"
menu "Host processor type and features"
-config CMPXCHG_LOCAL
- bool
- default n
-
-config CMPXCHG_DOUBLE
- bool
- default n
-
source "arch/x86/Kconfig.cpu"
endmenu
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 118c143a9cb4..2c32df6fe231 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -11,7 +11,7 @@
#endif
#define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP)
-#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_IP)
+#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_SP)
#define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP)
#define ARCH_IS_STACKGROW(address) \
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 26c731a106af..fdce49c7aff6 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -29,7 +29,8 @@ config XEN_PVHVM
config XEN_MAX_DOMAIN_MEMORY
int
- default 128
+ default 500 if X86_64
+ default 64 if X86_32
depends on XEN
help
This only affects the sizing of some bss arrays, the unused
@@ -48,3 +49,4 @@ config XEN_DEBUG_FS
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.
+
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 7c0fedd98ea0..ef1db1900d86 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -109,7 +109,7 @@ static const struct file_operations u32_array_fops = {
.llseek = no_llseek,
};
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, unsigned elements)
{
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
index e28132084832..78d25499be5b 100644
--- a/arch/x86/xen/debugfs.h
+++ b/arch/x86/xen/debugfs.h
@@ -3,7 +3,7 @@
struct dentry * __init xen_init_debugfs(void);
-struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+struct dentry *xen_debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, unsigned elements);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index da8afd576a6b..12eb07bfb267 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void)
local_irq_disable();
early_boot_irqs_disabled = true;
- memblock_init();
-
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
xen_ident_map_ISA();
@@ -1356,7 +1354,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
int cpu = (long)hcpu;
switch (action) {
case CPU_UP_PREPARE:
- per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
+ xen_vcpu_setup(cpu);
if (xen_have_vector_callback)
xen_init_lock_cpu(cpu);
break;
@@ -1386,7 +1384,6 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_smp_init();
register_cpu_notifier(&xen_hvm_cpu_notifier);
xen_unplug_emulated_devices();
- have_vcpu_info_placement = 0;
x86_init.irqs.intr_init = xen_init_IRQ;
xen_hvm_init_time_ops();
xen_hvm_init_mmu_ops();
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 6bbfd7ac5e81..3a5f55d51907 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -54,6 +54,20 @@ static int map_pte_fn(pte_t *pte, struct page *pmd_page,
return 0;
}
+/*
+ * This function is used to map shared frames to store grant status. It is
+ * different from map_pte_fn above, the frames type here is uint64_t.
+ */
+static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ uint64_t **frames = (uint64_t **)data;
+
+ set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+ (*frames)++;
+ return 0;
+}
+
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
@@ -64,14 +78,14 @@ static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
- struct grant_entry **__shared)
+ void **__shared)
{
int rc;
- struct grant_entry *shared = *__shared;
+ void *shared = *__shared;
if (shared == NULL) {
struct vm_struct *area =
- alloc_vm_area(PAGE_SIZE * max_nr_gframes);
+ alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
BUG_ON(area == NULL);
shared = area->addr;
*__shared = shared;
@@ -83,8 +97,30 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
return rc;
}
-void arch_gnttab_unmap_shared(struct grant_entry *shared,
- unsigned long nr_gframes)
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ int rc;
+ grant_status_t *shared = *__shared;
+
+ if (shared == NULL) {
+ /* No need to pass in PTE as we are going to do it
+ * in apply_to_page_range anyhow. */
+ struct vm_struct *area =
+ alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
+ BUG_ON(area == NULL);
+ shared = area->addr;
+ *__shared = shared;
+ }
+
+ rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_gframes,
+ map_pte_fn_status, &frames);
+ return rc;
+}
+
+void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 87f6673b1207..58a0e46c404d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1774,10 +1774,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
__xen_write_cr3(true, __pa(pgd));
xen_mc_issue(PARAVIRT_LAZY_CPU);
- memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
- __pa(xen_start_info->pt_base +
- xen_start_info->nr_pt_frames * PAGE_SIZE),
- "XEN PAGETABLES");
+ memblock_reserve(__pa(xen_start_info->pt_base),
+ xen_start_info->nr_pt_frames * PAGE_SIZE);
return pgd;
}
@@ -1853,10 +1851,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
PFN_DOWN(__pa(initial_page_table)));
xen_write_cr3(__pa(initial_page_table));
- memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
- __pa(xen_start_info->pt_base +
- xen_start_info->nr_pt_frames * PAGE_SIZE),
- "XEN PAGETABLES");
+ memblock_reserve(__pa(xen_start_info->pt_base),
+ xen_start_info->nr_pt_frames * PAGE_SIZE);
return initial_page_table;
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 38d0af4fefec..e03c63692176 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
- memblock_x86_reserve_range(start, start + size, "XEN EXTRA");
+ memblock_reserve(start, size);
xen_max_p2m_pfn = PFN_DOWN(start + size);
@@ -173,9 +173,21 @@ static unsigned long __init xen_get_max_pages(void)
domid_t domid = DOMID_SELF;
int ret;
- ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
- if (ret > 0)
- max_pages = ret;
+ /*
+ * For the initial domain we use the maximum reservation as
+ * the maximum page.
+ *
+ * For guest domains the current maximum reservation reflects
+ * the current maximum rather than the static maximum. In this
+ * case the e820 map provided to us will cover the static
+ * maximum region.
+ */
+ if (xen_initial_domain()) {
+ ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
+ if (ret > 0)
+ max_pages = ret;
+ }
+
return min(max_pages, MAX_DOMAIN_PAGES);
}
@@ -299,9 +311,8 @@ char * __init xen_memory_setup(void)
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
*/
- memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
- __pa(xen_start_info->pt_base),
- "XEN START INFO");
+ memblock_reserve(__pa(xen_start_info->mfn_list),
+ xen_start_info->pt_base - xen_start_info->mfn_list);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
@@ -410,6 +421,6 @@ void __init xen_arch_setup(void)
#endif
disable_cpuidle();
boot_option_idle_override = IDLE_HALT;
-
+ WARN_ON(set_pm_idle_to_default());
fiddle_vdso();
}