aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig42
-rw-r--r--arch/x86/Kconfig.debug13
-rw-r--r--arch/x86/Makefile1
-rw-r--r--arch/x86/Makefile.um2
-rw-r--r--arch/x86/boot/Makefile3
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/aslr.c34
-rw-r--r--arch/x86/boot/compressed/eboot.c3
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S25
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S196
-rw-r--r--arch/x86/boot/compressed/misc.c12
-rw-r--r--arch/x86/boot/compressed/misc.h7
-rw-r--r--arch/x86/boot/ctype.h5
-rw-r--r--arch/x86/boot/early_serial_console.c6
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aes_ctrby8_avx-x86_64.S46
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S343
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c34
-rw-r--r--arch/x86/crypto/des3_ede_glue.c2
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c2
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/acpi.h1
-rw-r--r--arch/x86/include/asm/apic.h66
-rw-r--r--arch/x86/include/asm/calling.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h2
-rw-r--r--arch/x86/include/asm/debugreg.h5
-rw-r--r--arch/x86/include/asm/desc.h20
-rw-r--r--arch/x86/include/asm/fpu-internal.h10
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h1
-rw-r--r--arch/x86/include/asm/i387.h6
-rw-r--r--arch/x86/include/asm/imr.h60
-rw-r--r--arch/x86/include/asm/intel-mid.h3
-rw-r--r--arch/x86/include/asm/io_apic.h5
-rw-r--r--arch/x86/include/asm/irq_remapping.h4
-rw-r--r--arch/x86/include/asm/kasan.h31
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h59
-rw-r--r--arch/x86/include/asm/lguest_hcall.h1
-rw-r--r--arch/x86/include/asm/livepatch.h46
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/include/asm/mmu.h2
-rw-r--r--arch/x86/include/asm/mmu_context.h53
-rw-r--r--arch/x86/include/asm/page_64_types.h12
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h6
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/pgtable-2level.h38
-rw-r--r--arch/x86/include/asm/pgtable-3level.h12
-rw-r--r--arch/x86/include/asm/pgtable.h68
-rw-r--r--arch/x86/include/asm/pgtable_64.h11
-rw-r--r--arch/x86/include/asm/pgtable_types.h46
-rw-r--r--arch/x86/include/asm/pmc_atom.h22
-rw-r--r--arch/x86/include/asm/processor.h33
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h68
-rw-r--r--arch/x86/include/asm/special_insns.h6
-rw-r--r--arch/x86/include/asm/spinlock.h94
-rw-r--r--arch/x86/include/asm/string_64.h18
-rw-r--r--arch/x86/include/asm/thread_info.h19
-rw-r--r--arch/x86/include/asm/tlbflush.h77
-rw-r--r--arch/x86/include/asm/traps.h6
-rw-r--r--arch/x86/include/asm/uaccess.h2
-rw-r--r--arch/x86/include/asm/vgtod.h6
-rw-r--r--arch/x86/include/asm/virtext.h5
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/xen/page.h20
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h11
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h12
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/acpi/boot.c52
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/apb_timer.c8
-rw-r--r--arch/x86/kernel/apic/apic.c455
-rw-r--r--arch/x86/kernel/apic/io_apic.c13
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c19
-rw-r--r--arch/x86/kernel/cpu/common.c32
-rw-r--r--arch/x86/kernel/cpu/intel.c6
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c26
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c145
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c8
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c2
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c6
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c1
-rw-r--r--arch/x86/kernel/cpu/mtrr/cyrix.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c6
-rw-r--r--arch/x86/kernel/cpu/perf_event.c76
-rw-r--r--arch/x86/kernel/cpu/perf_event.h2
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c1
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c4
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c46
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.h20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c17
-rw-r--r--arch/x86/kernel/dumpstack.c5
-rw-r--r--arch/x86/kernel/e820.c26
-rw-r--r--arch/x86/kernel/early_printk.c187
-rw-r--r--arch/x86/kernel/entry_64.S317
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/head64.c11
-rw-r--r--arch/x86/kernel/head_64.S30
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/hw_breakpoint.c45
-rw-r--r--arch/x86/kernel/i387.c42
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/irq_32.c13
-rw-r--r--arch/x86/kernel/kprobes/core.c22
-rw-r--r--arch/x86/kernel/kprobes/opt.c3
-rw-r--r--arch/x86/kernel/kvm.c13
-rw-r--r--arch/x86/kernel/livepatch.c90
-rw-r--r--arch/x86/kernel/module.c24
-rw-r--r--arch/x86/kernel/perf_regs.c90
-rw-r--r--arch/x86/kernel/pmc_atom.c81
-rw-r--r--arch/x86/kernel/process.c5
-rw-r--r--arch/x86/kernel/process_32.c2
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/rtc.c6
-rw-r--r--arch/x86/kernel/setup.c35
-rw-r--r--arch/x86/kernel/signal.c8
-rw-r--r--arch/x86/kernel/smpboot.c113
-rw-r--r--arch/x86/kernel/tls.c25
-rw-r--r--arch/x86/kernel/traps.c131
-rw-r--r--arch/x86/kernel/tsc.c2
-rw-r--r--arch/x86/kernel/uprobes.c153
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c10
-rw-r--r--arch/x86/kernel/xsave.c3
-rw-r--r--arch/x86/kvm/Kconfig2
-rw-r--r--arch/x86/kvm/emulate.c261
-rw-r--r--arch/x86/kvm/ioapic.h2
-rw-r--r--arch/x86/kvm/iommu.c4
-rw-r--r--arch/x86/kvm/lapic.c150
-rw-r--r--arch/x86/kvm/lapic.h6
-rw-r--r--arch/x86/kvm/mmu.c351
-rw-r--r--arch/x86/kvm/mmu.h17
-rw-r--r--arch/x86/kvm/svm.c6
-rw-r--r--arch/x86/kvm/trace.h38
-rw-r--r--arch/x86/kvm/vmx.c1096
-rw-r--r--arch/x86/kvm/x86.c209
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lguest/boot.c173
-rw-r--r--arch/x86/lib/insn.c2
-rw-r--r--arch/x86/lib/memcpy_64.S6
-rw-r--r--arch/x86/lib/memmove_64.S4
-rw-r--r--arch/x86/lib/memset_64.S10
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/gup.c13
-rw-r--r--arch/x86/mm/hugetlbpage.c31
-rw-r--r--arch/x86/mm/init.c84
-rw-r--r--arch/x86/mm/kasan_init_64.c206
-rw-r--r--arch/x86/mm/mmap.c6
-rw-r--r--arch/x86/mm/mpx.c6
-rw-r--r--arch/x86/mm/numa.c6
-rw-r--r--arch/x86/mm/pat.c7
-rw-r--r--arch/x86/mm/pgtable.c14
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/pci/acpi.c293
-rw-r--r--arch/x86/pci/bus_numa.c4
-rw-r--r--arch/x86/pci/common.c50
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/intel_mid_pci.c5
-rw-r--r--arch/x86/pci/irq.c15
-rw-r--r--arch/x86/pci/mmconfig-shared.c34
-rw-r--r--arch/x86/pci/xen.c53
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S161
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S121
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_max3111.c35
-rw-r--r--arch/x86/platform/intel-mid/early_printk_intel_mid.c220
-rw-r--r--arch/x86/platform/intel-mid/intel_mid_vrtc.c2
-rw-r--r--arch/x86/platform/intel-quark/Makefile2
-rw-r--r--arch/x86/platform/intel-quark/imr.c661
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c129
-rw-r--r--arch/x86/platform/uv/uv_nmi.c25
-rw-r--r--arch/x86/power/cpu.c11
-rw-r--r--arch/x86/realmode/Makefile2
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/realmode/rm/Makefile1
-rw-r--r--arch/x86/tools/calc_run_size.pl39
-rw-r--r--arch/x86/tools/calc_run_size.sh42
-rw-r--r--arch/x86/um/signal.c2
-rw-r--r--arch/x86/vdso/Makefile3
-rw-r--r--arch/x86/vdso/vma.c45
-rw-r--r--arch/x86/xen/enlighten.c26
-rw-r--r--arch/x86/xen/mmu.c17
-rw-r--r--arch/x86/xen/p2m.c289
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/smp.c2
-rw-r--r--arch/x86/xen/spinlock.c13
-rw-r--r--arch/x86/xen/time.c22
-rw-r--r--arch/x86/xen/xen-ops.h6
198 files changed, 6170 insertions, 3231 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ba397bde7948..c2fb8a87dccb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,6 +17,7 @@ config X86_64
depends on 64BIT
select X86_DEV_DMA_OPS
select ARCH_USE_CMPXCHG_LOCKREF
+ select HAVE_LIVEPATCH
### Arch settings
config X86
@@ -84,6 +85,7 @@ config X86
select HAVE_CMPXCHG_LOCAL
select HAVE_CMPXCHG_DOUBLE
select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_USER_RETURN_NOTIFIER
select ARCH_BINFMT_ELF_RANDOMIZE_PIE
select HAVE_ARCH_JUMP_LABEL
@@ -138,6 +140,7 @@ config X86
select HAVE_ACPI_APEI_NMI if ACPI
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select X86_FEATURE_NAMES if PROC_FS
+ select SRCU
config INSTRUCTION_DECODER
def_bool y
@@ -485,6 +488,22 @@ config X86_INTEL_MID
Intel MID platforms are based on an Intel processor and chipset which
consume less power than most of the x86 derivatives.
+config X86_INTEL_QUARK
+ bool "Intel Quark platform support"
+ depends on X86_32
+ depends on X86_EXTENDED_PLATFORM
+ depends on X86_PLATFORM_DEVICES
+ depends on X86_TSC
+ depends on PCI
+ depends on PCI_GOANY
+ depends on X86_IO_APIC
+ select IOSF_MBI
+ select INTEL_IMR
+ ---help---
+ Select to include support for Quark X1000 SoC.
+ Say Y here if you have a Quark based system such as the Arduino
+ compatible Intel Galileo.
+
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
depends on ACPI
@@ -496,6 +515,17 @@ config X86_INTEL_LPSS
things like clock tree (common clock framework) and pincontrol
which are needed by the LPSS peripheral drivers.
+config X86_AMD_PLATFORM_DEVICE
+ bool "AMD ACPI2Platform devices support"
+ depends on ACPI
+ select COMMON_CLK
+ select PINCTRL
+ ---help---
+ Select to interpret AMD specific ACPI device to platform device
+ such as I2C, UART, GPIO found on AMD Carrizo and later chipsets.
+ I2C and UART depend on COMMON_CLK to set clock. GPIO driver is
+ implemented under PINCTRL subsystem.
+
config IOSF_MBI
tristate "Intel SoC IOSF Sideband support for SoC platforms"
depends on PCI
@@ -855,9 +885,13 @@ config SCHED_MC
source "kernel/Kconfig.preempt"
+config UP_LATE_INIT
+ def_bool y
+ depends on !SMP && X86_LOCAL_APIC
+
config X86_UP_APIC
bool "Local APIC support on uniprocessors"
- depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI
+ depends on X86_32 && !SMP && !X86_32_NON_STANDARD
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
@@ -868,6 +902,10 @@ config X86_UP_APIC
performance counters), and the NMI watchdog which detects hard
lockups.
+config X86_UP_APIC_MSI
+ def_bool y
+ select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
+
config X86_UP_IOAPIC
bool "IO-APIC support on uniprocessors"
depends on X86_UP_APIC
@@ -2008,6 +2046,8 @@ config CMDLINE_OVERRIDE
This is used to work around broken boot loaders. This should
be set to 'N' under normal conditions.
+source "kernel/livepatch/Kconfig"
+
endmenu
config ARCH_ENABLE_MEMORY_HOTPLUG
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 61bd2ad94281..20028da8ae18 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -313,6 +313,19 @@ config DEBUG_NMI_SELFTEST
If unsure, say N.
+config DEBUG_IMR_SELFTEST
+ bool "Isolated Memory Region self test"
+ default n
+ depends on INTEL_IMR
+ ---help---
+ This option enables automated sanity testing of the IMR code.
+ Some simple tests are run to verify IMR bounds checking, alignment
+ and overlapping. This option is really only useful if you are
+ debugging an IMR memory map or are modifying the IMR code and want to
+ test your changes.
+
+ If unsure say N here.
+
config X86_DEBUG_STATIC_CPU_HAS
bool "Debug alternatives"
depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 920e6160c535..5ba2d9ce82dc 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -148,6 +148,7 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 36b62bc52638..95eba554baf9 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -30,7 +30,7 @@ cflags-y += -ffreestanding
# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
# a lot more stack due to the lack of sharing of stacklots. Also, gcc
# 4.3.0 needs -funit-at-a-time for extern inline functions.
-KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \
+KBUILD_CFLAGS += $(shell if [ $(cc-version) -lt 0400 ] ; then \
echo $(call cc-option,-fno-unit-at-a-time); \
else echo $(call cc-option,-funit-at-a-time); fi ;)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 5b016e2498f3..57bbf2fb21f6 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -14,6 +14,8 @@
# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
# The number is the same as you would ordinarily press at bootup.
+KASAN_SANITIZE := n
+
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
targets := vmlinux.bin setup.bin setup.elf bzImage
@@ -51,6 +53,7 @@ targets += cpustr.h
$(obj)/cpustr.h: $(obj)/mkcpustr FORCE
$(call if_changed,cpustr)
endif
+clean-files += cpustr.h
# ---------------------------------------------------------------------------
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index d999398928bc..0a291cdfaf77 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -16,6 +16,8 @@
# (see scripts/Makefile.lib size_append)
# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
+KASAN_SANITIZE := n
+
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
@@ -49,6 +51,7 @@ $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
$(objtree)/drivers/firmware/efi/libstub/lib.a
+vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
$(obj)/vmlinux: $(vmlinux-objs-y) FORCE
$(call if_changed,ld)
@@ -90,7 +93,7 @@ suffix-$(CONFIG_KERNEL_LZO) := lzo
suffix-$(CONFIG_KERNEL_LZ4) := lz4
RUN_SIZE = $(shell $(OBJDUMP) -h vmlinux | \
- perl $(srctree)/arch/x86/tools/calc_run_size.pl)
+ $(CONFIG_SHELL) $(srctree)/arch/x86/tools/calc_run_size.sh)
quiet_cmd_mkpiggy = MKPIGGY $@
cmd_mkpiggy = $(obj)/mkpiggy $< $(RUN_SIZE) > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..7083c16cccba 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -14,6 +14,13 @@
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
+struct kaslr_setup_data {
+ __u64 next;
+ __u32 type;
+ __u32 len;
+ __u8 data[1];
+} kaslr_setup_data;
+
#define I8254_PORT_CONTROL 0x43
#define I8254_PORT_COUNTER0 0x40
#define I8254_CMD_READBACK 0xC0
@@ -295,7 +302,29 @@ static unsigned long find_random_addr(unsigned long minimum,
return slots_fetch_random();
}
-unsigned char *choose_kernel_location(unsigned char *input,
+static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled)
+{
+ struct setup_data *data;
+
+ kaslr_setup_data.type = SETUP_KASLR;
+ kaslr_setup_data.len = 1;
+ kaslr_setup_data.next = 0;
+ kaslr_setup_data.data[0] = enabled;
+
+ data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
+
+ while (data && data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+
+ if (data)
+ data->next = (unsigned long)&kaslr_setup_data;
+ else
+ params->hdr.setup_data = (unsigned long)&kaslr_setup_data;
+
+}
+
+unsigned char *choose_kernel_location(struct boot_params *params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size)
@@ -306,14 +335,17 @@ unsigned char *choose_kernel_location(unsigned char *input,
#ifdef CONFIG_HIBERNATION
if (!cmdline_find_option_bool("kaslr")) {
debug_putstr("KASLR disabled by default...\n");
+ add_kaslr_setup_data(params, 0);
goto out;
}
#else
if (cmdline_find_option_bool("nokaslr")) {
debug_putstr("KASLR disabled by cmdline...\n");
+ add_kaslr_setup_data(params, 0);
goto out;
}
#endif
+ add_kaslr_setup_data(params, 1);
/* Record the various known unsafe memory ranges. */
mem_avoid_init((unsigned long)input, input_size,
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 92b9a5f2aed6..ef17683484e9 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -13,8 +13,7 @@
#include <asm/setup.h>
#include <asm/desc.h>
-#undef memcpy /* Use memcpy from misc.c */
-
+#include "../string.h"
#include "eboot.h"
static efi_system_table_t *sys_table;
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
index 7ff3632806b1..99494dff2113 100644
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ b/arch/x86/boot/compressed/efi_stub_64.S
@@ -3,28 +3,3 @@
#include <asm/processor-flags.h>
#include "../../platform/efi/efi_stub_64.S"
-
-#ifdef CONFIG_EFI_MIXED
- .code64
- .text
-ENTRY(efi64_thunk)
- push %rbp
- push %rbx
-
- subq $16, %rsp
- leaq efi_exit32(%rip), %rax
- movl %eax, 8(%rsp)
- leaq efi_gdt64(%rip), %rax
- movl %eax, 4(%rsp)
- movl %eax, 2(%rax) /* Fixup the gdt base address */
- leaq efi32_boot_gdt(%rip), %rax
- movl %eax, (%rsp)
-
- call __efi64_thunk
-
- addq $16, %rsp
- pop %rbx
- pop %rbp
- ret
-ENDPROC(efi64_thunk)
-#endif /* CONFIG_EFI_MIXED */
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
new file mode 100644
index 000000000000..630384a4c14a
--- /dev/null
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2014, 2015 Intel Corporation; author Matt Fleming
+ *
+ * Early support for invoking 32-bit EFI services from a 64-bit kernel.
+ *
+ * Because this thunking occurs before ExitBootServices() we have to
+ * restore the firmware's 32-bit GDT before we make EFI serivce calls,
+ * since the firmware's 32-bit IDT is still currently installed and it
+ * needs to be able to service interrupts.
+ *
+ * On the plus side, we don't have to worry about mangling 64-bit
+ * addresses into 32-bits because we're executing with an identify
+ * mapped pagetable and haven't transitioned to 64-bit virtual addresses
+ * yet.
+ */
+
+#include <linux/linkage.h>
+#include <asm/msr.h>
+#include <asm/page_types.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+ .code64
+ .text
+ENTRY(efi64_thunk)
+ push %rbp
+ push %rbx
+
+ subq $8, %rsp
+ leaq efi_exit32(%rip), %rax
+ movl %eax, 4(%rsp)
+ leaq efi_gdt64(%rip), %rax
+ movl %eax, (%rsp)
+ movl %eax, 2(%rax) /* Fixup the gdt base address */
+
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ /*
+ * Convert x86-64 ABI params to i386 ABI
+ */
+ subq $32, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movq %r8, %rsi
+ movl %esi, 0xc(%rsp)
+ movq %r9, %rsi
+ movl %esi, 0x10(%rsp)
+
+ sgdt save_gdt(%rip)
+
+ leaq 1f(%rip), %rbx
+ movq %rbx, func_rt_ptr(%rip)
+
+ /*
+ * Switch to gdt with 32-bit segments. This is the firmware GDT
+ * that was installed when the kernel started executing. This
+ * pointer was saved at the EFI stub entry point in head_64.S.
+ */
+ leaq efi32_boot_gdt(%rip), %rax
+ lgdt (%rax)
+
+ pushq $__KERNEL_CS
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $32, %rsp
+
+ lgdt save_gdt(%rip)
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+
+ /*
+ * Convert 32-bit status code into 64-bit.
+ */
+ test %rax, %rax
+ jz 1f
+ movl %eax, %ecx
+ andl $0x0fffffff, %ecx
+ andl $0xf0000000, %eax
+ shl $32, %rax
+ or %rcx, %rax
+1:
+ addq $8, %rsp
+ pop %rbx
+ pop %rbp
+ ret
+ENDPROC(efi64_thunk)
+
+ENTRY(efi_exit32)
+ movq func_rt_ptr(%rip), %rax
+ push %rax
+ mov %rdi, %rax
+ ret
+ENDPROC(efi_exit32)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ /* Reload pgtables */
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ /* Disable paging */
+ movl %cr0, %eax
+ btrl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+
+ /* Disable long mode via EFER */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btrl $_EFER_LME, %eax
+ wrmsr
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ /*
+ * Some firmware will return with interrupts enabled. Be sure to
+ * disable them before we switch GDTs.
+ */
+ cli
+
+ movl 56(%esp), %eax
+ movl %eax, 2(%eax)
+ lgdtl (%eax)
+
+ movl %cr4, %eax
+ btsl $(X86_CR4_PAE_BIT), %eax
+ movl %eax, %cr4
+
+ movl %cr3, %eax
+ movl %eax, %cr3
+
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ xorl %eax, %eax
+ lldt %ax
+
+ movl 60(%esp), %eax
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ /* Enable paging */
+ movl %cr0, %eax
+ btsl $X86_CR0_PG_BIT, %eax
+ movl %eax, %cr0
+ lret
+ENDPROC(efi_enter32)
+
+ .data
+ .balign 8
+ .global efi32_boot_gdt
+efi32_boot_gdt: .word 0
+ .quad 0
+
+save_gdt: .word 0
+ .quad 0
+func_rt_ptr: .quad 0
+
+ .global efi_gdt64
+efi_gdt64:
+ .word efi_gdt64_end - efi_gdt64
+ .long 0 /* Filled out by user */
+ .word 0
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x00af9a000000ffff /* __KERNEL_CS */
+ .quad 0x00cf92000000ffff /* __KERNEL_DS */
+ .quad 0x0080890000000000 /* TS descriptor */
+ .quad 0x0000000000000000 /* TS continued */
+efi_gdt64_end:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index dcc1c536cc21..5903089c818f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -373,6 +373,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
unsigned long output_len,
unsigned long run_size)
{
+ unsigned char *output_orig = output;
+
real_mode = rmode;
sanitize_boot_params(real_mode);
@@ -399,7 +401,8 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
* the entire decompressed kernel plus relocation table, or the
* entire decompressed kernel plus .bss and .brk sections.
*/
- output = choose_kernel_location(input_data, input_len, output,
+ output = choose_kernel_location(real_mode, input_data, input_len,
+ output,
output_len > run_size ? output_len
: run_size);
@@ -421,7 +424,12 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
debug_putstr("\nDecompressing Linux... ");
decompress(input_data, input_len, NULL, NULL, output, NULL, error);
parse_elf(output);
- handle_relocations(output, output_len);
+ /*
+ * 32-bit always performs relocations. 64-bit relocations are only
+ * needed if kASLR has chosen a different load address.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) || output != output_orig)
+ handle_relocations(output, output_len);
debug_putstr("done.\nBooting the kernel.\n");
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 24e3e569a13c..ee3576b2666b 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
* we just keep it from happening
*/
#undef CONFIG_PARAVIRT
+#undef CONFIG_KASAN
#ifdef CONFIG_X86_32
#define _ASM_X86_DESC_H 1
#endif
@@ -56,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
#if CONFIG_RANDOMIZE_BASE
/* aslr.c */
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size);
@@ -64,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
bool has_cpuflag(int flag);
#else
static inline
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *params,
+ unsigned char *input,
unsigned long input_size,
unsigned char *output,
unsigned long output_size)
diff --git a/arch/x86/boot/ctype.h b/arch/x86/boot/ctype.h
index 25e13403193c..020f137df7a2 100644
--- a/arch/x86/boot/ctype.h
+++ b/arch/x86/boot/ctype.h
@@ -1,6 +1,5 @@
-#ifndef BOOT_ISDIGIT_H
-
-#define BOOT_ISDIGIT_H
+#ifndef BOOT_CTYPE_H
+#define BOOT_CTYPE_H
static inline int isdigit(int ch)
{
diff --git a/arch/x86/boot/early_serial_console.c b/arch/x86/boot/early_serial_console.c
index 5df2869c874b..45a07684bbab 100644
--- a/arch/x86/boot/early_serial_console.c
+++ b/arch/x86/boot/early_serial_console.c
@@ -2,8 +2,6 @@
#define DEFAULT_SERIAL_PORT 0x3f8 /* ttyS0 */
-#define XMTRDY 0x20
-
#define DLAB 0x80
#define TXR 0 /* Transmit register (WRITE) */
@@ -74,8 +72,8 @@ static void parse_earlyprintk(void)
static const int bases[] = { 0x3f8, 0x2f8 };
int idx = 0;
- if (!strncmp(arg + pos, "ttyS", 4))
- pos += 4;
+ /* += strlen("ttyS"); */
+ pos += 4;
if (arg[pos++] == '1')
idx = 1;
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index fd0f848938cc..5a4a089e8b1f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -26,7 +26,6 @@ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
-obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
@@ -46,6 +45,7 @@ endif
ifeq ($(avx2_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
+ obj-$(CONFIG_CRYPTO_SHA1_MB) += sha-mb/
endif
aes-i586-y := aes-i586-asm_32.o aes_glue.o
diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
index 2df2a0298f5a..a916c4a61165 100644
--- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
+++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S
@@ -208,7 +208,7 @@ ddq_add_8:
.if (klen == KEY_128)
.if (load_keys)
- vmovdqa 3*16(p_keys), xkeyA
+ vmovdqa 3*16(p_keys), xkey4
.endif
.else
vmovdqa 3*16(p_keys), xkeyA
@@ -224,7 +224,7 @@ ddq_add_8:
add $(16*by), p_in
.if (klen == KEY_128)
- vmovdqa 4*16(p_keys), xkey4
+ vmovdqa 4*16(p_keys), xkeyB
.else
.if (load_keys)
vmovdqa 4*16(p_keys), xkey4
@@ -234,7 +234,12 @@ ddq_add_8:
.set i, 0
.rept by
club XDATA, i
- vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
+ /* key 3 */
+ .if (klen == KEY_128)
+ vaesenc xkey4, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
.set i, (i +1)
.endr
@@ -243,13 +248,18 @@ ddq_add_8:
.set i, 0
.rept by
club XDATA, i
- vaesenc xkey4, var_xdata, var_xdata /* key 4 */
+ /* key 4 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey4, var_xdata, var_xdata
+ .endif
.set i, (i +1)
.endr
.if (klen == KEY_128)
.if (load_keys)
- vmovdqa 6*16(p_keys), xkeyB
+ vmovdqa 6*16(p_keys), xkey8
.endif
.else
vmovdqa 6*16(p_keys), xkeyB
@@ -267,12 +277,17 @@ ddq_add_8:
.set i, 0
.rept by
club XDATA, i
- vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
+ /* key 6 */
+ .if (klen == KEY_128)
+ vaesenc xkey8, var_xdata, var_xdata
+ .else
+ vaesenc xkeyB, var_xdata, var_xdata
+ .endif
.set i, (i +1)
.endr
.if (klen == KEY_128)
- vmovdqa 8*16(p_keys), xkey8
+ vmovdqa 8*16(p_keys), xkeyB
.else
.if (load_keys)
vmovdqa 8*16(p_keys), xkey8
@@ -288,7 +303,7 @@ ddq_add_8:
.if (klen == KEY_128)
.if (load_keys)
- vmovdqa 9*16(p_keys), xkeyA
+ vmovdqa 9*16(p_keys), xkey12
.endif
.else
vmovdqa 9*16(p_keys), xkeyA
@@ -297,7 +312,12 @@ ddq_add_8:
.set i, 0
.rept by
club XDATA, i
- vaesenc xkey8, var_xdata, var_xdata /* key 8 */
+ /* key 8 */
+ .if (klen == KEY_128)
+ vaesenc xkeyB, var_xdata, var_xdata
+ .else
+ vaesenc xkey8, var_xdata, var_xdata
+ .endif
.set i, (i +1)
.endr
@@ -306,7 +326,12 @@ ddq_add_8:
.set i, 0
.rept by
club XDATA, i
- vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
+ /* key 9 */
+ .if (klen == KEY_128)
+ vaesenc xkey12, var_xdata, var_xdata
+ .else
+ vaesenc xkeyA, var_xdata, var_xdata
+ .endif
.set i, (i +1)
.endr
@@ -412,7 +437,6 @@ ddq_add_8:
/* main body of aes ctr load */
.macro do_aes_ctrmain key_len
-
cmp $16, num_bytes
jb .Ldo_return2\key_len
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 477e9d75149b..6bd2c6c95373 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,12 +32,23 @@
#include <linux/linkage.h>
#include <asm/inst.h>
+/*
+ * The following macros are used to move an (un)aligned 16 byte value to/from
+ * an XMM register. This can done for either FP or integer values, for FP use
+ * movaps (move aligned packed single) or integer use movdqa (move double quad
+ * aligned). It doesn't make a performance difference which instruction is used
+ * since Nehalem (original Core i7) was released. However, the movaps is a byte
+ * shorter, so that is the one we'll use for now. (same for unaligned).
+ */
+#define MOVADQ movaps
+#define MOVUDQ movups
+
#ifdef __x86_64__
+
.data
.align 16
.Lgf128mul_x_ble_mask:
.octa 0x00000000000000010000000000000087
-
POLY: .octa 0xC2000000000000000000000000000001
TWOONE: .octa 0x00000001000000000000000000000001
@@ -89,6 +100,7 @@ enc: .octa 0x2
#define arg8 STACK_OFFSET+16(%r14)
#define arg9 STACK_OFFSET+24(%r14)
#define arg10 STACK_OFFSET+32(%r14)
+#define keysize 2*15*16(%arg1)
#endif
@@ -213,10 +225,12 @@ enc: .octa 0x2
.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11
pxor %xmm\i, %xmm\i
+
_get_AAD_loop\num_initial_blocks\operation:
movd (%r10), \TMP1
pslldq $12, \TMP1
@@ -225,16 +239,18 @@ _get_AAD_loop\num_initial_blocks\operation:
add $4, %r10
sub $4, %r12
jne _get_AAD_loop\num_initial_blocks\operation
+
cmp $16, %r11
je _get_AAD_loop2_done\num_initial_blocks\operation
+
mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq $4, %xmm\i
sub $4, %r12
cmp %r11, %r12
jne _get_AAD_loop2\num_initial_blocks\operation
+
_get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
xor %r11, %r11 # initialise the data pointer offset as zero
@@ -243,59 +259,34 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ (%arg1),\TMP2
.irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
+ paddd \TMP1, \XMM0 # INCR Y0
movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
-
-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ pxor \TMP2, %xmm\index
.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_dec\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg3 , %r11, 1), \TMP1
@@ -305,10 +296,8 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
add $16, %r11
movdqa \TMP1, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index
-
- # prepare plaintext/ciphertext for GHASH computation
+ # prepare plaintext/ciphertext for GHASH computation
.endr
.endif
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -338,30 +327,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%rip), \TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
@@ -399,7 +386,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_dec_done\num_initial_blocks
+
+aes_loop_pre_dec\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_dec\num_initial_blocks
+
+aes_loop_pre_dec_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
@@ -421,15 +424,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
movdqa \TMP1, \XMM4
add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
@@ -451,6 +450,7 @@ _initial_blocks_done\num_initial_blocks\operation:
.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ MOVADQ SHUF_MASK(%rip), %xmm14
mov arg7, %r10 # %r10 = AAD
mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11
@@ -472,7 +472,6 @@ _get_AAD_loop2\num_initial_blocks\operation:
cmp %r11, %r12
jne _get_AAD_loop2\num_initial_blocks\operation
_get_AAD_loop2_done\num_initial_blocks\operation:
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
xor %r11, %r11 # initialise the data pointer offset as zero
@@ -481,59 +480,35 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
mov %arg5, %rax # %rax = *Y0
movdqu (%rax), \XMM0 # XMM0 = Y0
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM0
.if (\i == 5) || (\i == 6) || (\i == 7)
-.irpc index, \i_seq
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, %xmm\index
- movdqa SHUF_MASK(%rip), %xmm14
- PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
-.endr
-.irpc index, \i_seq
- pxor 16*0(%arg1), %xmm\index
-.endr
-.irpc index, \i_seq
- movaps 0x10(%rdi), \TMP1
- AESENC \TMP1, %xmm\index # Round 1
-.endr
-.irpc index, \i_seq
- movaps 0x20(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
+ MOVADQ ONE(%RIP),\TMP1
+ MOVADQ 0(%arg1),\TMP2
.irpc index, \i_seq
- movaps 0x30(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, %xmm\index
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+ pxor \TMP2, %xmm\index
.endr
-.irpc index, \i_seq
- movaps 0x40(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x50(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x60(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x70(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x80(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
-.endr
-.irpc index, \i_seq
- movaps 0x90(%arg1), \TMP1
- AESENC \TMP1, %xmm\index # Round 2
+ lea 0x10(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+
+aes_loop_initial_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP1
+.irpc index, \i_seq
+ AESENC \TMP1, %xmm\index
.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_initial_enc\num_initial_blocks
+
+ MOVADQ (%r10), \TMP1
.irpc index, \i_seq
- movaps 0xa0(%arg1), \TMP1
- AESENCLAST \TMP1, %xmm\index # Round 10
+ AESENCLAST \TMP1, %xmm\index # Last Round
.endr
.irpc index, \i_seq
movdqu (%arg3 , %r11, 1), \TMP1
@@ -541,8 +516,6 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu %xmm\index, (%arg2 , %r11, 1)
# write back plaintext/ciphertext for num_initial_blocks
add $16, %r11
-
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, %xmm\index
# prepare plaintext/ciphertext for GHASH computation
@@ -575,30 +548,28 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
* Precomputations for HashKey parallel with encryption of first 4 blocks.
* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
*/
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM1
- movdqa SHUF_MASK(%rip), %xmm14
+ MOVADQ ONE(%RIP),\TMP1
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM1
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM2
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM2
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM3
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM3
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- paddd ONE(%rip), \XMM0 # INCR Y0
- movdqa \XMM0, \XMM4
- movdqa SHUF_MASK(%rip), %xmm14
+ paddd \TMP1, \XMM0 # INCR Y0
+ MOVADQ \XMM0, \XMM4
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
- pxor 16*0(%arg1), \XMM1
- pxor 16*0(%arg1), \XMM2
- pxor 16*0(%arg1), \XMM3
- pxor 16*0(%arg1), \XMM4
+ MOVADQ 0(%arg1),\TMP1
+ pxor \TMP1, \XMM1
+ pxor \TMP1, \XMM2
+ pxor \TMP1, \XMM3
+ pxor \TMP1, \XMM4
movdqa \TMP3, \TMP5
pshufd $78, \TMP3, \TMP1
pxor \TMP3, \TMP1
@@ -636,7 +607,23 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
pshufd $78, \TMP5, \TMP1
pxor \TMP5, \TMP1
movdqa \TMP1, HashKey_4_k(%rsp)
- movaps 0xa0(%arg1), \TMP2
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_pre_enc_done\num_initial_blocks
+
+aes_loop_pre_enc\num_initial_blocks:
+ MOVADQ (%r10),\TMP2
+.irpc index, 1234
+ AESENC \TMP2, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_pre_enc\num_initial_blocks
+
+aes_loop_pre_enc_done\num_initial_blocks:
+ MOVADQ (%r10), \TMP2
AESENCLAST \TMP2, \XMM1
AESENCLAST \TMP2, \XMM2
AESENCLAST \TMP2, \XMM3
@@ -655,15 +642,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation:
movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
add $64, %r11
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
pxor \XMMDst, \XMM1
# combine GHASHed value with the corresponding ciphertext
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
- movdqa SHUF_MASK(%rip), %xmm14
PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
_initial_blocks_done\num_initial_blocks\operation:
@@ -794,7 +777,23 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_enc_done
+
+aes_loop_par_enc:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_enc
+
+aes_loop_par_enc_done:
+ MOVADQ (%r10), \TMP3
AESENCLAST \TMP3, \XMM1 # Round 10
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
@@ -986,8 +985,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
AESENC \TMP3, \XMM3
AESENC \TMP3, \XMM4
PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
- movaps 0xa0(%arg1), \TMP3
- AESENCLAST \TMP3, \XMM1 # Round 10
+ lea 0xa0(%arg1),%r10
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ sub $4,%eax # 128->0, 192->2, 256->4
+ jz aes_loop_par_dec_done
+
+aes_loop_par_dec:
+ MOVADQ (%r10),\TMP3
+.irpc index, 1234
+ AESENC \TMP3, %xmm\index
+.endr
+ add $16,%r10
+ sub $1,%eax
+ jnz aes_loop_par_dec
+
+aes_loop_par_dec_done:
+ MOVADQ (%r10), \TMP3
+ AESENCLAST \TMP3, \XMM1 # last round
AESENCLAST \TMP3, \XMM2
AESENCLAST \TMP3, \XMM3
AESENCLAST \TMP3, \XMM4
@@ -1155,33 +1170,29 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
pxor \TMP6, \XMMDst # reduced result is in XMMDst
.endm
-/* Encryption of a single block done*/
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
- pxor (%arg1), \XMM0
- movaps 16(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 32(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 48(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 64(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 80(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 96(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 112(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 128(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 144(%arg1), \TMP1
- AESENC \TMP1, \XMM0
- movaps 160(%arg1), \TMP1
- AESENCLAST \TMP1, \XMM0
-.endm
+/* Encryption of a single block
+* uses eax & r10
+*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+ pxor (%arg1), \XMM0
+ mov keysize,%eax
+ shr $2,%eax # 128->4, 192->6, 256->8
+ add $5,%eax # 128->9, 192->11, 256->13
+ lea 16(%arg1), %r10 # get first expanded key address
+
+_esb_loop_\@:
+ MOVADQ (%r10),\TMP1
+ AESENC \TMP1,\XMM0
+ add $16,%r10
+ sub $1,%eax
+ jnz _esb_loop_\@
+
+ MOVADQ (%r10),\TMP1
+ AESENCLAST \TMP1,\XMM0
+.endm
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, // Plaintext output. Encrypt in-place is allowed.
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ae855f4f64b7..947c6bf52c33 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -43,6 +43,7 @@
#include <asm/crypto/glue_helper.h>
#endif
+
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
@@ -182,7 +183,8 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
@@ -197,7 +199,8 @@ static void aesni_gcm_dec_avx(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else {
@@ -231,7 +234,8 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (plaintext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
aad_len, auth_tag, auth_tag_len);
} else if (plaintext_len < AVX_GEN4_OPTSIZE) {
@@ -250,7 +254,8 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len)
{
- if (ciphertext_len < AVX_GEN2_OPTSIZE) {
+ struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
+ if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
aad, aad_len, auth_tag, auth_tag_len);
} else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
@@ -511,7 +516,7 @@ static int ctr_crypt(struct blkcipher_desc *desc,
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
- nbytes & AES_BLOCK_MASK, walk.iv);
+ nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
err = blkcipher_walk_done(desc, &walk, nbytes);
}
@@ -902,7 +907,8 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
}
/*Account for 4 byte nonce at the end.*/
key_len -= 4;
- if (key_len != AES_KEYSIZE_128) {
+ if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256) {
crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
@@ -1013,6 +1019,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
__be32 counter = cpu_to_be32(1);
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
u8 iv_tab[16+AESNI_ALIGN];
@@ -1027,6 +1034,13 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
/* to 8 or 12 bytes */
if (unlikely(req->assoclen != 8 && req->assoclen != 12))
return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
/* IV below built */
for (i = 0; i < 4; i++)
*(iv+i) = ctx->nonce[i];
@@ -1091,6 +1105,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
int retval = 0;
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ u32 key_len = ctx->aes_key_expanded.key_length;
void *aes_ctx = &(ctx->aes_key_expanded);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
u8 iv_and_authTag[32+AESNI_ALIGN];
@@ -1104,6 +1119,13 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
if (unlikely((req->cryptlen < auth_tag_len) ||
(req->assoclen != 8 && req->assoclen != 12)))
return -EINVAL;
+ if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
+ return -EINVAL;
+ if (unlikely(key_len != AES_KEYSIZE_128 &&
+ key_len != AES_KEYSIZE_192 &&
+ key_len != AES_KEYSIZE_256))
+ return -EINVAL;
+
/* Assuming we are supporting rfc4106 64-bit extended */
/* sequence numbers We need to have the AAD length */
/* equal to 8 or 12 bytes */
diff --git a/arch/x86/crypto/des3_ede_glue.c b/arch/x86/crypto/des3_ede_glue.c
index 38a14f818ef1..d6fc59aaaadf 100644
--- a/arch/x86/crypto/des3_ede_glue.c
+++ b/arch/x86/crypto/des3_ede_glue.c
@@ -504,6 +504,4 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized");
MODULE_ALIAS_CRYPTO("des3_ede");
MODULE_ALIAS_CRYPTO("des3_ede-asm");
-MODULE_ALIAS_CRYPTO("des");
-MODULE_ALIAS_CRYPTO("des-asm");
MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index a225a5ca1037..fd9f6b035b16 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -931,4 +931,4 @@ module_exit(sha1_mb_mod_fini);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, multi buffer accelerated");
-MODULE_ALIAS("sha1");
+MODULE_ALIAS_CRYPTO("sha1");
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9e181aaba97..d0165c9a2932 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
u32 tmp;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
/*
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 82e8a1d44658..156ebcab4ada 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -179,8 +179,8 @@ sysenter_dispatch:
sysexit_from_sys_call:
andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
/* clear IF, that popfq doesn't enable interrupts early */
- andl $~0x200,EFLAGS-R11(%rsp)
- movl RIP-R11(%rsp),%edx /* User %eip */
+ andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
+ movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
RESTORE_ARGS 0,24,0,0,0,0
xorq %r8,%r8
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 0ab4f9fd2687..3a45668f6dc3 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -50,6 +50,7 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity);
+extern void (*__acpi_unregister_gsi)(u32 gsi);
static inline void disable_acpi(void)
{
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 465b309af254..efc3b22d896e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -106,7 +106,14 @@ extern u32 native_safe_apic_wait_icr_idle(void);
extern void native_apic_icr_write(u32 low, u32 id);
extern u64 native_apic_icr_read(void);
-extern int x2apic_mode;
+static inline bool apic_is_x2apic_enabled(void)
+{
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_APICBASE, &msr))
+ return false;
+ return msr & X2APIC_ENABLE;
+}
#ifdef CONFIG_X86_X2APIC
/*
@@ -169,48 +176,23 @@ static inline u64 native_x2apic_icr_read(void)
return val;
}
+extern int x2apic_mode;
extern int x2apic_phys;
-extern int x2apic_preenabled;
-extern void check_x2apic(void);
-extern void enable_x2apic(void);
+extern void __init check_x2apic(void);
+extern void x2apic_setup(void);
static inline int x2apic_enabled(void)
{
- u64 msr;
-
- if (!cpu_has_x2apic)
- return 0;
-
- rdmsrl(MSR_IA32_APICBASE, msr);
- if (msr & X2APIC_ENABLE)
- return 1;
- return 0;
+ return cpu_has_x2apic && apic_is_x2apic_enabled();
}
#define x2apic_supported() (cpu_has_x2apic)
-static inline void x2apic_force_phys(void)
-{
- x2apic_phys = 1;
-}
#else
-static inline void disable_x2apic(void)
-{
-}
-static inline void check_x2apic(void)
-{
-}
-static inline void enable_x2apic(void)
-{
-}
-static inline int x2apic_enabled(void)
-{
- return 0;
-}
-static inline void x2apic_force_phys(void)
-{
-}
+static inline void check_x2apic(void) { }
+static inline void x2apic_setup(void) { }
+static inline int x2apic_enabled(void) { return 0; }
-#define x2apic_preenabled 0
-#define x2apic_supported() 0
+#define x2apic_mode (0)
+#define x2apic_supported() (0)
#endif
extern void enable_IR_x2apic(void);
@@ -219,7 +201,6 @@ extern int get_physical_broadcast(void);
extern int lapic_get_maxlvt(void);
extern void clear_local_APIC(void);
-extern void connect_bsp_APIC(void);
extern void disconnect_bsp_APIC(int virt_wire_setup);
extern void disable_local_APIC(void);
extern void lapic_shutdown(void);
@@ -227,14 +208,23 @@ extern int verify_local_APIC(void);
extern void sync_Arb_IDs(void);
extern void init_bsp_APIC(void);
extern void setup_local_APIC(void);
-extern void end_local_APIC_setup(void);
-extern void bsp_end_local_APIC_setup(void);
extern void init_apic_mappings(void);
void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
extern int APIC_init_uniprocessor(void);
+
+#ifdef CONFIG_X86_64
+static inline int apic_force_enable(unsigned long addr)
+{
+ return -1;
+}
+#else
extern int apic_force_enable(unsigned long addr);
+#endif
+
+extern int apic_bsp_setup(bool upmode);
+extern void apic_ap_setup(void);
/*
* On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 76659b67fd11..1f1297b46f83 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -83,7 +83,6 @@ For 32-bit we have the following conventions - kernel is built with
#define SS 160
#define ARGOFFSET R11
-#define SWFRAME ORIG_RAX
.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
subq $9*8+\addskip, %rsp
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index aede2c347bde..90a54851aedc 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -174,6 +174,7 @@
#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */
#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */
#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */
/*
@@ -388,6 +389,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT)
+#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT)
#if __GNUC__ >= 4
extern void warn_pre_alternatives(void);
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 61fd18b83b6c..12cb66f6d3a5 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -114,5 +114,10 @@ static inline void debug_stack_usage_inc(void) { }
static inline void debug_stack_usage_dec(void) { }
#endif /* X86_64 */
+#ifdef CONFIG_CPU_SUP_AMD
+extern void set_dr_addr_mask(unsigned long mask, int dr);
+#else
+static inline void set_dr_addr_mask(unsigned long mask, int dr) { }
+#endif
#endif /* _ASM_X86_DEBUGREG_H */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 50d033a8947d..a94b82e8f156 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -251,7 +251,8 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
-#define _LDT_empty(info) \
+/* This intentionally ignores lm, since 32-bit apps don't have that field. */
+#define LDT_empty(info) \
((info)->base_addr == 0 && \
(info)->limit == 0 && \
(info)->contents == 0 && \
@@ -261,11 +262,18 @@ static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
(info)->seg_not_present == 1 && \
(info)->useable == 0)
-#ifdef CONFIG_X86_64
-#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
-#else
-#define LDT_empty(info) (_LDT_empty(info))
-#endif
+/* Lots of programs expect an all-zero user_desc to mean "no segment at all". */
+static inline bool LDT_zero(const struct user_desc *info)
+{
+ return (info->base_addr == 0 &&
+ info->limit == 0 &&
+ info->contents == 0 &&
+ info->read_exec_only == 0 &&
+ info->seg_32bit == 0 &&
+ info->limit_in_pages == 0 &&
+ info->seg_not_present == 0 &&
+ info->useable == 0);
+}
static inline void clear_LDT(void)
{
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e97622f57722..0dbc08282291 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -207,7 +207,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
if (config_enabled(CONFIG_X86_32))
asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
else if (config_enabled(CONFIG_AS_FXSAVEQ))
- asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave));
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
else {
/* Using "rex64; fxsave %0" is broken because, if the memory
* operand uses any extended registers for addressing, a second
@@ -290,9 +290,11 @@ static inline int fpu_restore_checking(struct fpu *fpu)
static inline int restore_fpu_checking(struct task_struct *tsk)
{
- /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. "m" is a random variable that should be in L1 */
+ /*
+ * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+ * pending. Clear the x87 state here by setting it to fixed values.
+ * "m" is a random variable that should be in L1.
+ */
if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
asm volatile(
"fnclex\n\t"
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index ef1c4d2d41ec..6c98be864a75 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -12,6 +12,7 @@
*/
struct arch_hw_breakpoint {
unsigned long address;
+ unsigned long mask;
u8 len;
u8 type;
};
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ed8089d69094..6eb6fcb83f63 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -40,8 +40,8 @@ extern void __kernel_fpu_end(void);
static inline void kernel_fpu_begin(void)
{
- WARN_ON_ONCE(!irq_fpu_usable());
preempt_disable();
+ WARN_ON_ONCE(!irq_fpu_usable());
__kernel_fpu_begin();
}
@@ -51,6 +51,10 @@ static inline void kernel_fpu_end(void)
preempt_enable();
}
+/* Must be called with preempt disabled */
+extern void kernel_fpu_disable(void);
+extern void kernel_fpu_enable(void);
+
/*
* Some instructions like VIA's padlock instructions generate a spurious
* DNA fault but don't modify SSE registers. And these instructions
diff --git a/arch/x86/include/asm/imr.h b/arch/x86/include/asm/imr.h
new file mode 100644
index 000000000000..cd2ce4068441
--- /dev/null
+++ b/arch/x86/include/asm/imr.h
@@ -0,0 +1,60 @@
+/*
+ * imr.h: Isolated Memory Region API
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef _IMR_H
+#define _IMR_H
+
+#include <linux/types.h>
+
+/*
+ * IMR agent access mask bits
+ * See section 12.7.4.7 from quark-x1000-datasheet.pdf for register
+ * definitions.
+ */
+#define IMR_ESRAM_FLUSH BIT(31)
+#define IMR_CPU_SNOOP BIT(30) /* Applicable only to write */
+#define IMR_RMU BIT(29)
+#define IMR_VC1_SAI_ID3 BIT(15)
+#define IMR_VC1_SAI_ID2 BIT(14)
+#define IMR_VC1_SAI_ID1 BIT(13)
+#define IMR_VC1_SAI_ID0 BIT(12)
+#define IMR_VC0_SAI_ID3 BIT(11)
+#define IMR_VC0_SAI_ID2 BIT(10)
+#define IMR_VC0_SAI_ID1 BIT(9)
+#define IMR_VC0_SAI_ID0 BIT(8)
+#define IMR_CPU_0 BIT(1) /* SMM mode */
+#define IMR_CPU BIT(0) /* Non SMM mode */
+#define IMR_ACCESS_NONE 0
+
+/*
+ * Read/Write access-all bits here include some reserved bits
+ * These are the values firmware uses and are accepted by hardware.
+ * The kernel defines read/write access-all in the same way as firmware
+ * in order to have a consistent and crisp definition across firmware,
+ * bootloader and kernel.
+ */
+#define IMR_READ_ACCESS_ALL 0xBFFFFFFF
+#define IMR_WRITE_ACCESS_ALL 0xFFFFFFFF
+
+/* Number of IMRs provided by Quark X1000 SoC */
+#define QUARK_X1000_IMR_MAX 0x08
+#define QUARK_X1000_IMR_REGBASE 0x40
+
+/* IMR alignment bits - only bits 31:10 are checked for IMR validity */
+#define IMR_ALIGN 0x400
+#define IMR_MASK (IMR_ALIGN - 1)
+
+int imr_add_range(phys_addr_t base, size_t size,
+ unsigned int rmask, unsigned int wmask, bool lock);
+
+int imr_remove_range(phys_addr_t base, size_t size);
+
+#endif /* _IMR_H */
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index e34e097b6f9d..705d35708a50 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
-extern struct console early_mrst_console;
-extern void mrst_early_console_init(void);
-
extern struct console early_hsu_console;
extern void hsu_early_console_init(const char *);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index bf006cce9418..2f91685fe1cd 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -279,6 +279,11 @@ static inline void disable_ioapic_support(void) { }
#define native_ioapic_set_affinity NULL
#define native_setup_ioapic_entry NULL
#define native_eoi_ioapic_pin NULL
+
+static inline void setup_IO_APIC(void) { }
+static inline void enable_IO_APIC(void) { }
+static inline void setup_ioapic_dest(void) { }
+
#endif
#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index b7747c4c2cf2..6224d316c405 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,8 +33,6 @@ struct irq_cfg;
#ifdef CONFIG_IRQ_REMAP
-extern void setup_irq_remapping_ops(void);
-extern int irq_remapping_supported(void);
extern void set_irq_remapping_broken(void);
extern int irq_remapping_prepare(void);
extern int irq_remapping_enable(void);
@@ -60,8 +58,6 @@ void irq_remap_modify_chip_defaults(struct irq_chip *chip);
#else /* CONFIG_IRQ_REMAP */
-static inline void setup_irq_remapping_ops(void) { }
-static inline int irq_remapping_supported(void) { return 0; }
static inline void set_irq_remapping_broken(void) { }
static inline int irq_remapping_prepare(void) { return -ENODEV; }
static inline int irq_remapping_enable(void) { return -ENODEV; }
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
new file mode 100644
index 000000000000..8b22422fbad8
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,31 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+/*
+ * Compiler uses shadow offset assuming that addresses start
+ * from 0. Kernel addresses don't start from 0, so shadow
+ * for kernel really starts from compiler's shadow offset +
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
+ */
+#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
+ (0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
+
+#ifndef __ASSEMBLY__
+
+extern pte_t kasan_zero_pte[];
+extern pte_t kasan_zero_pmd[];
+extern pte_t kasan_zero_pud[];
+
+#ifdef CONFIG_KASAN
+void __init kasan_map_early_shadow(pgd_t *pgd);
+void __init kasan_init(void);
+#else
+static inline void kasan_map_early_shadow(pgd_t *pgd) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index eb181178fe0b..57a9d94fe160 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -208,6 +208,7 @@ struct x86_emulate_ops {
void (*get_cpuid)(struct x86_emulate_ctxt *ctxt,
u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
+ void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d89c6b828c96..a236e39cc385 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -38,8 +38,6 @@
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
-#define KVM_MMIO_SIZE 16
-
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -51,7 +49,7 @@
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
-#define CR3_PCID_INVD (1UL << 63)
+#define CR3_PCID_INVD BIT_64(63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -160,6 +158,18 @@ enum {
#define DR7_FIXED_1 0x00000400
#define DR7_VOLATILE 0xffff2bff
+#define PFERR_PRESENT_BIT 0
+#define PFERR_WRITE_BIT 1
+#define PFERR_USER_BIT 2
+#define PFERR_RSVD_BIT 3
+#define PFERR_FETCH_BIT 4
+
+#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
+#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
+#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
+#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
+#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -615,6 +625,8 @@ struct kvm_arch {
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
+
+ bool boot_vcpu_runs_old_kvmclock;
};
struct kvm_vm_stat {
@@ -643,6 +655,7 @@ struct kvm_vcpu_stat {
u32 irq_window_exits;
u32 nmi_window_exits;
u32 halt_exits;
+ u32 halt_successful_poll;
u32 halt_wakeup;
u32 request_irq_exits;
u32 irq_exits;
@@ -787,6 +800,31 @@ struct kvm_x86_ops {
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*sched_in)(struct kvm_vcpu *kvm, int cpu);
+
+ /*
+ * Arch-specific dirty logging hooks. These hooks are only supposed to
+ * be valid if the specific arch has hardware-accelerated dirty logging
+ * mechanism. Currently only for PML on VMX.
+ *
+ * - slot_enable_log_dirty:
+ * called when enabling log dirty mode for the slot.
+ * - slot_disable_log_dirty:
+ * called when disabling log dirty mode for the slot.
+ * also called when slot is created with log dirty disabled.
+ * - flush_log_dirty:
+ * called before reporting dirty_bitmap to userspace.
+ * - enable_log_dirty_pt_masked:
+ * called when reenabling log dirty for the GFNs in the mask after
+ * corresponding bits are cleared in slot->dirty_bitmap.
+ */
+ void (*slot_enable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*slot_disable_log_dirty)(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+ void (*flush_log_dirty)(struct kvm *kvm);
+ void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t offset, unsigned long mask);
};
struct kvm_arch_async_pf {
@@ -819,10 +857,17 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
- struct kvm_memory_slot *slot,
- gfn_t gfn_offset, unsigned long mask);
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index 879fd7d33877..ef01fef3eebc 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -16,7 +16,6 @@
#define LHCALL_SET_PTE 14
#define LHCALL_SET_PGD 15
#define LHCALL_LOAD_TLS 16
-#define LHCALL_NOTIFY 17
#define LHCALL_LOAD_GDT_ENTRY 18
#define LHCALL_SEND_INTERRUPTS 19
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
new file mode 100644
index 000000000000..a455a53d789a
--- /dev/null
+++ b/arch/x86/include/asm/livepatch.h
@@ -0,0 +1,46 @@
+/*
+ * livepatch.h - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _ASM_X86_LIVEPATCH_H
+#define _ASM_X86_LIVEPATCH_H
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+#ifdef CONFIG_LIVEPATCH
+static inline int klp_check_compiler_support(void)
+{
+#ifndef CC_USING_FENTRY
+ return 1;
+#endif
+ return 0;
+}
+extern int klp_write_module_reloc(struct module *mod, unsigned long type,
+ unsigned long loc, unsigned long value);
+
+static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+#else
+#error Live patching support is disabled; check CONFIG_LIVEPATCH
+#endif
+
+#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..9b3de99dc004 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -190,7 +190,6 @@ enum mcp_flags {
void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
int mce_notify_irq(void);
-void mce_notify_process(void);
DECLARE_PER_CPU(struct mce, injectm);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 876e74e8eec7..09b9620a73b4 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {
struct mutex lock;
void __user *vdso;
+
+ atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 40269a2bf6f9..883f6b933fa4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -18,6 +18,21 @@ static inline void paravirt_activate_mm(struct mm_struct *prev,
}
#endif /* !CONFIG_PARAVIRT */
+#ifdef CONFIG_PERF_EVENTS
+extern struct static_key rdpmc_always_available;
+
+static inline void load_mm_cr4(struct mm_struct *mm)
+{
+ if (static_key_true(&rdpmc_always_available) ||
+ atomic_read(&mm->context.perf_rdpmc_allowed))
+ cr4_set_bits(X86_CR4_PCE);
+ else
+ cr4_clear_bits(X86_CR4_PCE);
+}
+#else
+static inline void load_mm_cr4(struct mm_struct *mm) {}
+#endif
+
/*
* Used for LDT copy/destruction.
*/
@@ -52,15 +67,20 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* Stop flush ipis for the previous mm */
cpumask_clear_cpu(cpu, mm_cpumask(prev));
+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
/*
* Load the LDT, if the LDT is different.
*
- * It's possible leave_mm(prev) has been called. If so,
- * then prev->context.ldt could be out of sync with the
- * LDT descriptor or the LDT register. This can only happen
- * if prev->context.ldt is non-null, since we never free
- * an LDT. But LDTs can't be shared across mms, so
- * prev->context.ldt won't be equal to next->context.ldt.
+ * It's possible that prev->context.ldt doesn't match
+ * the LDT register. This can happen if leave_mm(prev)
+ * was called and then modify_ldt changed
+ * prev->context.ldt but suppressed an IPI to this CPU.
+ * In this case, prev->context.ldt != NULL, because we
+ * never free an LDT while the mm still exists. That
+ * means that next->context.ldt != prev->context.ldt,
+ * because mms never share an LDT.
*/
if (unlikely(prev->context.ldt != next->context.ldt))
load_LDT_nolock(&next->context);
@@ -85,6 +105,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
load_LDT_nolock(&next->context);
}
}
@@ -130,7 +151,25 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- mpx_notify_unmap(mm, vma, start, end);
+ /*
+ * mpx_notify_unmap() goes and reads a rarely-hot
+ * cacheline in the mm_struct. That can be expensive
+ * enough to be seen in profiles.
+ *
+ * The mpx_notify_unmap() call and its contents have been
+ * observed to affect munmap() performance on hardware
+ * where MPX is not present.
+ *
+ * The unlikely() optimizes for the fast case: no MPX
+ * in the CPU, or no MPX use in the process. Even if
+ * we get this wrong (in the unlikely event that MPX
+ * is widely enabled on some system) the overhead of
+ * MPX itself (reading bounds tables) is expected to
+ * overwhelm the overhead of getting this unlikely()
+ * consistently wrong.
+ */
+ if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
+ mpx_notify_unmap(mm, vma, start, end);
}
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 75450b2c7be4..4edd53b79a81 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
#ifndef _ASM_X86_PAGE_64_DEFS_H
#define _ASM_X86_PAGE_64_DEFS_H
-#define THREAD_SIZE_ORDER 2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
+
+#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
#define DOUBLEFAULT_STACK 1
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..95e11f79f123 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,8 @@ extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
extern unsigned long max_pfn_mapped;
+extern bool kaslr_enabled;
+
static inline phys_addr_t get_max_mapped(void)
{
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 32444ae939ca..965c47d254aa 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -80,16 +80,16 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
}
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
}
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 164e3f8d3c3d..fa1195dae425 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,8 +93,6 @@ extern raw_spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
extern void (*pcibios_disable_irq)(struct pci_dev *dev);
-extern bool mp_should_keep_irq(struct device *dev);
-
struct pci_raw_ops {
int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val);
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 206a87fdd22d..fd74a11959de 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -62,44 +62,8 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi
return ((value >> rightshift) & mask) << leftshift;
}
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range.
- */
-#define PTE_FILE_MAX_BITS 29
-#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1)
-#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1)
-
-#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1)
-#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2)
-
-static __always_inline pgoff_t pte_to_pgoff(pte_t pte)
-{
- return (pgoff_t)
- (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) +
- pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) +
- pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, -1UL, PTE_FILE_LSHIFT3));
-}
-
-static __always_inline pte_t pgoff_to_pte(pgoff_t off)
-{
- return (pte_t){
- .pte_low =
- pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) +
- pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) +
- pte_bitop(off, PTE_FILE_LSHIFT3, -1UL, PTE_FILE_SHIFT3) +
- _PAGE_FILE,
- };
-}
-
/* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 81bb91b49a88..cdaa58c9b39e 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -176,18 +176,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#endif
-/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
- *
- * For soft-dirty tracking 11 bit is taken from
- * the low part of pte as well.
- */
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) \
- ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS 32
-
/* Encode and de-code a swap entry */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
#define __swp_type(x) (((x).val) & 0x1f)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e8a5454acc99..a0c35bf6cb92 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -115,11 +115,6 @@ static inline int pte_write(pte_t pte)
return pte_flags(pte) & _PAGE_RW;
}
-static inline int pte_file(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_FILE;
-}
-
static inline int pte_huge(pte_t pte)
{
return pte_flags(pte) & _PAGE_PSE;
@@ -137,13 +132,7 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte)
{
- /*
- * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h.
- * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 ==
- * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL.
- */
- return (pte_flags(pte) & _PAGE_SPECIAL) &&
- (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE));
+ return pte_flags(pte) & _PAGE_SPECIAL;
}
static inline unsigned long pte_pfn(pte_t pte)
@@ -305,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
static inline pmd_t pmd_mknotpresent(pmd_t pmd)
{
- return pmd_clear_flags(pmd, _PAGE_PRESENT);
+ return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
}
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -329,21 +318,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
}
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
- return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
- return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_SOFT_DIRTY;
-}
-
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
/*
@@ -463,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b)
static inline int pte_present(pte_t a)
{
- return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
- _PAGE_NUMA);
-}
-
-#define pte_present_nonuma pte_present_nonuma
-static inline int pte_present_nonuma(pte_t a)
-{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
@@ -479,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
if (pte_flags(a) & _PAGE_PRESENT)
return true;
- if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+ if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm))
return true;
@@ -499,10 +466,27 @@ static inline int pmd_present(pmd_t pmd)
* the _PAGE_PSE flag will remain set at all times while the
* _PAGE_PRESENT bit is clear).
*/
- return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
- _PAGE_NUMA);
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * These work without NUMA balancing but the kernel does not care. See the
+ * comment in include/asm-generic/pgtable.h
+ */
+static inline int pte_protnone(pte_t pte)
+{
+ return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+ return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline int pmd_none(pmd_t pmd)
{
/* Only check low word on 32-bit platforms, since it might be
@@ -559,11 +543,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
static inline int pmd_bad(pmd_t pmd)
{
-#ifdef CONFIG_NUMA_BALANCING
- /* pmd_numa check */
- if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
- return 0;
-#endif
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
}
@@ -882,19 +861,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
static inline int pte_swp_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
}
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
- VM_BUG_ON(pte_present_nonuma(pte));
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
}
#endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4572b2f30237..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -133,10 +133,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* PUD - Level3 access */
/* PMD - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
- _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
/* PTE - Level 1 access. */
@@ -145,13 +141,8 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
#define pte_unmap(pte) ((void)(pte))/* NOP */
/* Encode and de-code a swap entry */
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
-#ifdef CONFIG_NUMA_BALANCING
-/* Automatic NUMA balancing needs to be distinguishable from swap entries */
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
-#else
+#define SWP_TYPE_BITS 5
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#endif
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 25bcd4a89517..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -4,7 +4,7 @@
#include <linux/const.h>
#include <asm/page_types.h>
-#define FIRST_USER_ADDRESS 0
+#define FIRST_USER_ADDRESS 0UL
#define _PAGE_BIT_PRESENT 0 /* is present */
#define _PAGE_BIT_RW 1 /* writeable */
@@ -27,19 +27,9 @@
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
-
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -78,21 +68,6 @@
#endif
/*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA (_AT(pteval_t, 0))
-#endif
-
-/*
* Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict
* with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -114,7 +89,6 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
@@ -125,8 +99,8 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
- _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+ _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
/*
* The cache modes defined here are used to translate between pure SW usage
@@ -327,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
-#ifdef CONFIG_NUMA_BALANCING
-/* Set of bits that distinguishes present, prot_none and numa ptes */
-#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
-static inline pteval_t ptenuma_flags(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_NUMA_MASK;
-}
-
-static inline pmdval_t pmdnuma_flags(pmd_t pmd)
-{
- return pmd_flags(pmd) & _PAGE_NUMA_MASK;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )
diff --git a/arch/x86/include/asm/pmc_atom.h b/arch/x86/include/asm/pmc_atom.h
index fc7a17c05d35..bc0fc0866553 100644
--- a/arch/x86/include/asm/pmc_atom.h
+++ b/arch/x86/include/asm/pmc_atom.h
@@ -53,6 +53,28 @@
/* Sleep state counter is in units of of 32us */
#define PMC_TMR_SHIFT 5
+/* Power status of power islands */
+#define PMC_PSS 0x98
+
+#define PMC_PSS_BIT_GBE BIT(0)
+#define PMC_PSS_BIT_SATA BIT(1)
+#define PMC_PSS_BIT_HDA BIT(2)
+#define PMC_PSS_BIT_SEC BIT(3)
+#define PMC_PSS_BIT_PCIE BIT(4)
+#define PMC_PSS_BIT_LPSS BIT(5)
+#define PMC_PSS_BIT_LPE BIT(6)
+#define PMC_PSS_BIT_DFX BIT(7)
+#define PMC_PSS_BIT_USH_CTRL BIT(8)
+#define PMC_PSS_BIT_USH_SUS BIT(9)
+#define PMC_PSS_BIT_USH_VCCS BIT(10)
+#define PMC_PSS_BIT_USH_VCCA BIT(11)
+#define PMC_PSS_BIT_OTG_CTRL BIT(12)
+#define PMC_PSS_BIT_OTG_VCCS BIT(13)
+#define PMC_PSS_BIT_OTG_VCCA_CLK BIT(14)
+#define PMC_PSS_BIT_OTG_VCCA BIT(15)
+#define PMC_PSS_BIT_USB BIT(16)
+#define PMC_PSS_BIT_USB_SUS BIT(17)
+
/* These registers reflect D3 status of functions */
#define PMC_D3_STS_0 0xA0
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a092a0cce0b7..ec1c93588cef 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -579,39 +579,6 @@ static inline void load_sp0(struct tss_struct *tss,
#define set_iopl_mask native_set_iopl_mask
#endif /* CONFIG_PARAVIRT */
-/*
- * Save the cr4 feature set we're using (ie
- * Pentium 4MB enable and PPro Global page
- * enable), so that any CPU's that boot up
- * after us can get the correct flags.
- */
-extern unsigned long mmu_cr4_features;
-extern u32 *trampoline_cr4_features;
-
-static inline void set_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features |= mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 |= mask;
- write_cr4(cr4);
-}
-
-static inline void clear_in_cr4(unsigned long mask)
-{
- unsigned long cr4;
-
- mmu_cr4_features &= ~mask;
- if (trampoline_cr4_features)
- *trampoline_cr4_features = mmu_cr4_features;
- cr4 = read_cr4();
- cr4 &= ~mask;
- write_cr4(cr4);
-}
-
typedef struct {
unsigned long seg;
} mm_segment_t;
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
deleted file mode 100644
index 0da7409f0bec..000000000000
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
- * which needs to alter them. */
-
-static inline void smpboot_clear_io_apic_irqs(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- io_apic_irqs = 0;
-#endif
-}
-
-static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0xa, 0xf);
- spin_unlock_irqrestore(&rtc_lock, flags);
- local_flush_tlb();
- pr_debug("1.\n");
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
- start_eip >> 4;
- pr_debug("2.\n");
- *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
- start_eip & 0xf;
- pr_debug("3.\n");
-}
-
-static inline void smpboot_restore_warm_reset_vector(void)
-{
- unsigned long flags;
-
- /*
- * Install writable page 0 entry to set BIOS data area.
- */
- local_flush_tlb();
-
- /*
- * Paranoid: Set warm reset code and vector here back
- * to default values.
- */
- spin_lock_irqsave(&rtc_lock, flags);
- CMOS_WRITE(0, 0xf);
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
-}
-
-static inline void __init smpboot_setup_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Here we can be sure that there is an IO-APIC in the system. Let's
- * go and set it up:
- */
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- }
-#endif
-}
-
-static inline void smpboot_clear_io_apic(void)
-{
-#ifdef CONFIG_X86_IO_APIC
- nr_ioapics = 0;
-#endif
-}
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..6a4b00fafb00 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -137,17 +137,17 @@ static inline void write_cr3(unsigned long x)
native_write_cr3(x);
}
-static inline unsigned long read_cr4(void)
+static inline unsigned long __read_cr4(void)
{
return native_read_cr4();
}
-static inline unsigned long read_cr4_safe(void)
+static inline unsigned long __read_cr4_safe(void)
{
return native_read_cr4_safe();
}
-static inline void write_cr4(unsigned long x)
+static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 625660f8a2fc..cf87de3fc390 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -46,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key);
static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
{
- set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+ set_bit(0, (volatile unsigned long *)&lock->tickets.head);
}
#else /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -60,10 +60,30 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
}
#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int __tickets_equal(__ticket_t one, __ticket_t two)
+{
+ return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+ __ticket_t head)
+{
+ if (head & TICKET_SLOWPATH_FLAG) {
+ arch_spinlock_t old, new;
+
+ old.tickets.head = head;
+ new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+ old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+ new.tickets.tail = old.tickets.tail;
+
+ /* try to clear slowpath flag when there are no contenders */
+ cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+ }
+}
static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
{
- return lock.tickets.head == lock.tickets.tail;
+ return __tickets_equal(lock.tickets.head, lock.tickets.tail);
}
/*
@@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
if (likely(inc.head == inc.tail))
goto out;
- inc.tail &= ~TICKET_SLOWPATH_FLAG;
for (;;) {
unsigned count = SPIN_THRESHOLD;
do {
- if (READ_ONCE(lock->tickets.head) == inc.tail)
- goto out;
+ inc.head = READ_ONCE(lock->tickets.head);
+ if (__tickets_equal(inc.head, inc.tail))
+ goto clear_slowpath;
cpu_relax();
} while (--count);
__ticket_lock_spinning(lock, inc.tail);
}
-out: barrier(); /* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+ __ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+ barrier(); /* make sure nothing creeps before the lock is taken */
}
static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
arch_spinlock_t old, new;
old.tickets = READ_ONCE(lock->tickets);
- if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+ if (!__tickets_equal(old.tickets.head, old.tickets.tail))
return 0;
new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+ new.head_tail &= ~TICKET_SLOWPATH_FLAG;
/* cmpxchg is a full barrier, so nothing can move before it */
return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
}
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
- arch_spinlock_t old)
-{
- arch_spinlock_t new;
-
- BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
- /* Perform the unlock on the "before" copy */
- old.tickets.head += TICKET_LOCK_INC;
-
- /* Clear the slowpath flag */
- new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
- /*
- * If the lock is uncontended, clear the flag - use cmpxchg in
- * case it changes behind our back though.
- */
- if (new.tickets.head != new.tickets.tail ||
- cmpxchg(&lock->head_tail, old.head_tail,
- new.head_tail) != old.head_tail) {
- /*
- * Lock still has someone queued for it, so wake up an
- * appropriate waiter.
- */
- __ticket_unlock_kick(lock, old.tickets.head);
- }
-}
-
static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
{
if (TICKET_SLOWPATH_FLAG &&
- static_key_false(&paravirt_ticketlocks_enabled)) {
- arch_spinlock_t prev;
+ static_key_false(&paravirt_ticketlocks_enabled)) {
+ __ticket_t head;
- prev = *lock;
- add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+ BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
- /* add_smp() is a full mb() */
+ head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
- if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
- __ticket_unlock_slowpath(lock, prev);
+ if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+ head &= ~TICKET_SLOWPATH_FLAG;
+ __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+ }
} else
__add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
}
@@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = READ_ONCE(lock->tickets);
- return tmp.tail != tmp.head;
+ return !__tickets_equal(tmp.tail, tmp.head);
}
static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
struct __raw_tickets tmp = READ_ONCE(lock->tickets);
- return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+ tmp.head &= ~TICKET_SLOWPATH_FLAG;
+ return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
}
#define arch_spin_is_contended arch_spin_is_contended
@@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
- __ticket_t head = ACCESS_ONCE(lock->tickets.head);
+ __ticket_t head = READ_ONCE(lock->tickets.head);
for (;;) {
- struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+ struct __raw_tickets tmp = READ_ONCE(lock->tickets);
/*
* We need to check "unlocked" in a loop, tmp.head == head
* can be false positive because of overflow.
*/
- if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
- tmp.head != head)
+ if (__tickets_equal(tmp.head, tmp.tail) ||
+ !__tickets_equal(tmp.head, head))
break;
cpu_relax();
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c468fc2c..e4661196994e 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
function. */
#define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
extern void *memcpy(void *to, const void *from, size_t len);
#else
-extern void *__memcpy(void *to, const void *from, size_t len);
#define memcpy(dst, src, len) \
({ \
size_t __len = (len); \
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
#define __HAVE_ARCH_MEMSET
void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
#define __HAVE_ARCH_MEMMOVE
void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
int memcmp(const void *cs, const void *ct, size_t count);
size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
char *strcat(char *dest, const char *src);
int strcmp(const char *cs, const char *ct);
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that not instrumented (e.g. mm/slub.c) we
+ * should use not instrumented version of mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 547e344a6dc6..1d4e4f279a32 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -31,7 +31,6 @@ struct thread_info {
__u32 cpu; /* current CPU */
int saved_preempt_count;
mm_segment_t addr_limit;
- struct restart_block restart_block;
void __user *sysenter_return;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
@@ -45,9 +44,6 @@ struct thread_info {
.cpu = 0, \
.saved_preempt_count = INIT_PREEMPT_COUNT, \
.addr_limit = KERNEL_DS, \
- .restart_block = { \
- .fn = do_no_restart_syscall, \
- }, \
}
#define init_thread_info (init_thread_union.thread_info)
@@ -75,7 +71,6 @@ struct thread_info {
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
#define TIF_SECCOMP 8 /* secure computing */
-#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
@@ -100,7 +95,6 @@ struct thread_info {
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
-#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
#define _TIF_UPROBE (1 << TIF_UPROBE)
#define _TIF_NOTSC (1 << TIF_NOTSC)
@@ -140,7 +134,7 @@ struct thread_info {
/* Only used for 64 bit */
#define _TIF_DO_NOTIFY_MASK \
- (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
+ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \
_TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
/* flags to check in __switch_to() */
@@ -170,6 +164,17 @@ static inline struct thread_info *current_thread_info(void)
return ti;
}
+static inline unsigned long current_stack_pointer(void)
+{
+ unsigned long sp;
+#ifdef CONFIG_X86_64
+ asm("mov %%rsp,%0" : "=g" (sp));
+#else
+ asm("mov %%esp,%0" : "=g" (sp));
+#endif
+ return sp;
+}
+
#else /* !__ASSEMBLY__ */
/* how to get the thread information struct from ASM */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 04905bfc508b..cd791948b286 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -15,6 +15,75 @@
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
+struct tlb_state {
+#ifdef CONFIG_SMP
+ struct mm_struct *active_mm;
+ int state;
+#endif
+
+ /*
+ * Access to this CR4 shadow and to H/W CR4 is protected by
+ * disabling interrupts when modifying either one.
+ */
+ unsigned long cr4;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+/* Initialize cr4 shadow for this CPU. */
+static inline void cr4_init_shadow(void)
+{
+ this_cpu_write(cpu_tlbstate.cr4, __read_cr4());
+}
+
+/* Set in this cpu's CR4. */
+static inline void cr4_set_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 | mask) != cr4) {
+ cr4 |= mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
+}
+
+/* Clear in this cpu's CR4. */
+static inline void cr4_clear_bits(unsigned long mask)
+{
+ unsigned long cr4;
+
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
+ if ((cr4 & ~mask) != cr4) {
+ cr4 &= ~mask;
+ this_cpu_write(cpu_tlbstate.cr4, cr4);
+ __write_cr4(cr4);
+ }
+}
+
+/* Read the CR4 shadow. */
+static inline unsigned long cr4_read_shadow(void)
+{
+ return this_cpu_read(cpu_tlbstate.cr4);
+}
+
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. This should only be used
+ * during boot on the boot cpu.
+ */
+extern unsigned long mmu_cr4_features;
+extern u32 *trampoline_cr4_features;
+
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static inline void __native_flush_tlb(void)
{
native_write_cr3(native_read_cr3());
@@ -24,7 +93,7 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;
- cr4 = native_read_cr4();
+ cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
@@ -184,12 +253,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-struct tlb_state {
- struct mm_struct *active_mm;
- int state;
-};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
-
static inline void reset_lazy_tlbstate(void)
{
this_cpu_write(cpu_tlbstate.state, 0);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 707adc6549d8..4e49d7dff78e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,7 @@
#ifndef _ASM_X86_TRAPS_H
#define _ASM_X86_TRAPS_H
+#include <linux/context_tracking_state.h>
#include <linux/kprobes.h>
#include <asm/debugreg.h>
@@ -110,6 +111,11 @@ asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void);
#endif
+extern enum ctx_state ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_begin_non_atomic(struct pt_regs *regs);
+extern void ist_end_non_atomic(void);
+
/* Interrupts/Exceptions */
enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 0d592e0a5b84..ace9dec050b1 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -179,7 +179,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
asm volatile("call __get_user_%P3" \
: "=a" (__ret_gu), "=r" (__val_gu) \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
- (x) = (__typeof__(*(ptr))) __val_gu; \
+ (x) = (__force __typeof__(*(ptr))) __val_gu; \
__ret_gu; \
})
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index e7e9682a33e9..f556c4843aa1 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -80,9 +80,11 @@ static inline unsigned int __getcpu(void)
/*
* Load per CPU data from GDT. LSL is faster than RDTSCP and
- * works on all CPUs.
+ * works on all CPUs. This is volatile so that it orders
+ * correctly wrt barrier() and to keep gcc from cleverly
+ * hoisting it out of the calling function.
*/
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
return p;
}
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 5da71c27cc59..cce9ee68e335 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/tlbflush.h>
/*
* VMX functions:
@@ -40,12 +41,12 @@ static inline int cpu_has_vmx(void)
static inline void cpu_vmxoff(void)
{
asm volatile (ASM_VMX_VMXOFF : : : "cc");
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}
static inline int cpu_vmx_enabled(void)
{
- return read_cr4() & X86_CR4_VMXE;
+ return __read_cr4() & X86_CR4_VMXE;
}
/** Disable VMX if it is enabled on the current CPU
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 45afaee9555c..da772edd19ab 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
@@ -121,6 +122,7 @@ enum vmcs_field {
GUEST_LDTR_SELECTOR = 0x0000080c,
GUEST_TR_SELECTOR = 0x0000080e,
GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
HOST_ES_SELECTOR = 0x00000c00,
HOST_CS_SELECTOR = 0x00000c02,
HOST_SS_SELECTOR = 0x00000c04,
@@ -140,6 +142,8 @@ enum vmcs_field {
VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
TSC_OFFSET = 0x00002010,
TSC_OFFSET_HIGH = 0x00002011,
VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 5eea09915a15..358dcd338915 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -55,9 +55,8 @@ extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
- struct gnttab_map_grant_ref *kmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
struct page **pages, unsigned int count);
-extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
/*
* Helper functions to write or read unsigned long values to/from
@@ -154,21 +153,12 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
return mfn;
pfn = mfn_to_pfn_no_overrides(mfn);
- if (__pfn_to_mfn(pfn) != mfn) {
- /*
- * If this appears to be a foreign mfn (because the pfn
- * doesn't map back to the mfn), then check the local override
- * table to see if there's a better pfn to use.
- *
- * m2p_find_override_pfn returns ~0 if it doesn't find anything.
- */
- pfn = m2p_find_override_pfn(mfn, ~0);
- }
+ if (__pfn_to_mfn(pfn) != mfn)
+ pfn = ~0;
/*
- * pfn is ~0 if there are no entries in the m2p for mfn or if the
- * entry doesn't map back to the mfn and m2p_override doesn't have a
- * valid entry for it.
+ * pfn is ~0 if there are no entries in the m2p for mfn or the
+ * entry doesn't map back to the mfn.
*/
if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
pfn = mfn;
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..44e6dd7e36a2 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -7,6 +7,7 @@
#define SETUP_DTB 2
#define SETUP_PCI 3
#define SETUP_EFI 4
+#define SETUP_KASLR 5
/* ram_size flags */
#define RAMDISK_IMAGE_START_MASK 0x07FF
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 462efe746d77..90c458e66e13 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -187,6 +187,17 @@
#define HV_X64_MSR_SINT14 0x4000009E
#define HV_X64_MSR_SINT15 0x4000009F
+/*
+ * Synthetic Timer MSRs. Four timers per vcpu.
+ */
+#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
+#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
+#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
+#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
+#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
+#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
+#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
+#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c8aa65d56027..3ce079136c11 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -152,6 +152,10 @@
#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
#define MSR_MC6_DEMOTION_POLICY_CONFIG 0x00000669
+#define MSR_CORE_PERF_LIMIT_REASONS 0x00000690
+#define MSR_GFX_PERF_LIMIT_REASONS 0x000006B0
+#define MSR_RING_PERF_LIMIT_REASONS 0x000006B1
+
/* Hardware P state interface */
#define MSR_PPERF 0x0000064e
#define MSR_PERF_LIMIT_REASONS 0x0000064f
@@ -251,6 +255,10 @@
/* Fam 16h MSRs */
#define MSR_F16H_L2I_PERF_CTL 0xc0010230
#define MSR_F16H_L2I_PERF_CTR 0xc0010231
+#define MSR_F16H_DR1_ADDR_MASK 0xc0011019
+#define MSR_F16H_DR2_ADDR_MASK 0xc001101a
+#define MSR_F16H_DR3_ADDR_MASK 0xc001101b
+#define MSR_F16H_DR0_ADDR_MASK 0xc0011027
/* Fam 15h MSRs */
#define MSR_F15H_PERF_CTL 0xc0010200
@@ -356,8 +364,12 @@
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
+#define MSR_IA32_SMM_MONITOR_CTL 0x0000009b
+#define MSR_IA32_SMBASE 0x0000009e
+
#define MSR_IA32_PERF_STATUS 0x00000198
#define MSR_IA32_PERF_CTL 0x00000199
+#define INTEL_PERF_CTL_MASK 0xffff
#define MSR_AMD_PSTATE_DEF_BASE 0xc0010064
#define MSR_AMD_PERF_STATUS 0xc0010063
#define MSR_AMD_PERF_CTL 0xc0010062
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index b813bf9da1e2..c5f1a1deb91a 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -56,6 +56,7 @@
#define EXIT_REASON_MSR_READ 31
#define EXIT_REASON_MSR_WRITE 32
#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MSR_LOAD_FAIL 34
#define EXIT_REASON_MWAIT_INSTRUCTION 36
#define EXIT_REASON_MONITOR_INSTRUCTION 39
#define EXIT_REASON_PAUSE_INSTRUCTION 40
@@ -72,6 +73,7 @@
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_PML_FULL 62
#define EXIT_REASON_XSAVES 63
#define EXIT_REASON_XRSTORS 64
@@ -116,10 +118,14 @@
{ EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \
{ EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
+ { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \
{ EXIT_REASON_INVD, "INVD" }, \
{ EXIT_REASON_INVVPID, "INVVPID" }, \
{ EXIT_REASON_INVPCID, "INVPCID" }, \
{ EXIT_REASON_XSAVES, "XSAVES" }, \
{ EXIT_REASON_XRSTORS, "XRSTORS" }
+#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
+
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5d4502c8b983..cdb1b70ddad0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,6 +16,10 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
CFLAGS_irq.o := -I$(src)/../include/asm/trace
obj-y := process_$(BITS).o signal.o entry_$(BITS).o
@@ -63,6 +67,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4433a4be8171..3d525c6124f6 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -611,20 +611,25 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
{
- int irq;
+ int rc, irq, trigger, polarity;
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
*irqp = gsi;
- } else {
- mutex_lock(&acpi_ioapic_lock);
- irq = mp_map_gsi_to_irq(gsi,
- IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
- mutex_unlock(&acpi_ioapic_lock);
- if (irq < 0)
- return -1;
- *irqp = irq;
+ return 0;
}
- return 0;
+
+ rc = acpi_get_override_irq(gsi, &trigger, &polarity);
+ if (rc == 0) {
+ trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
+ polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
+ irq = acpi_register_gsi(NULL, gsi, trigger, polarity);
+ if (irq >= 0) {
+ *irqp = irq;
+ return 0;
+ }
+ }
+
+ return -1;
}
EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
@@ -653,6 +658,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
return gsi;
}
+#ifdef CONFIG_X86_LOCAL_APIC
static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int trigger, int polarity)
{
@@ -675,6 +681,7 @@ static void acpi_unregister_gsi_ioapic(u32 gsi)
mutex_unlock(&acpi_ioapic_lock);
#endif
}
+#endif
int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
int trigger, int polarity) = acpi_register_gsi_pic;
@@ -750,13 +757,13 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
}
/* wrapper to silence section mismatch warning */
-int __ref acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu)
{
return _acpi_map_lsapic(handle, physid, pcpu);
}
-EXPORT_SYMBOL(acpi_map_lsapic);
+EXPORT_SYMBOL(acpi_map_cpu);
-int acpi_unmap_lsapic(int cpu)
+int acpi_unmap_cpu(int cpu)
{
#ifdef CONFIG_ACPI_NUMA
set_apicid_to_node(per_cpu(x86_cpu_to_apicid, cpu), NUMA_NO_NODE);
@@ -768,8 +775,7 @@ int acpi_unmap_lsapic(int cpu)
return (0);
}
-
-EXPORT_SYMBOL(acpi_unmap_lsapic);
+EXPORT_SYMBOL(acpi_unmap_cpu);
#endif /* CONFIG_ACPI_HOTPLUG_CPU */
int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
@@ -844,13 +850,7 @@ int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
static int __init acpi_parse_sbf(struct acpi_table_header *table)
{
- struct acpi_table_boot *sb;
-
- sb = (struct acpi_table_boot *)table;
- if (!sb) {
- printk(KERN_WARNING PREFIX "Unable to map SBF\n");
- return -ENODEV;
- }
+ struct acpi_table_boot *sb = (struct acpi_table_boot *)table;
sbf_port = sb->cmos_index; /* Save CMOS port */
@@ -864,13 +864,7 @@ static struct resource *hpet_res __initdata;
static int __init acpi_parse_hpet(struct acpi_table_header *table)
{
- struct acpi_table_hpet *hpet_tbl;
-
- hpet_tbl = (struct acpi_table_hpet *)table;
- if (!hpet_tbl) {
- printk(KERN_WARNING PREFIX "Unable to map HPET\n");
- return -ENODEV;
- }
+ struct acpi_table_hpet *hpet_tbl = (struct acpi_table_hpet *)table;
if (hpet_tbl->address.space_id != ACPI_SPACE_MEM) {
printk(KERN_WARNING PREFIX "HPET timers must be located in "
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 31368207837c..d1daead5fcdd 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void)
header->pmode_cr0 = read_cr0();
if (__this_cpu_read(cpu_info.cpuid_level) >= 0) {
- header->pmode_cr4 = read_cr4();
+ header->pmode_cr4 = __read_cr4();
header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
}
if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index b708738d016e..6a7c23ff21d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -135,14 +135,6 @@ static inline void apbt_clear_mapping(void)
apbt_virt_address = NULL;
}
-/*
- * APBT timer interrupt enable / disable
- */
-static inline int is_apbt_capable(void)
-{
- return apbt_virt_address ? 1 : 0;
-}
-
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 29b5b18afa27..ad3639ae1b9b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -134,9 +134,6 @@ static inline void imcr_apic_to_pic(void)
*/
static int force_enable_local_apic __initdata;
-/* Control whether x2APIC mode is enabled or not */
-static bool nox2apic __initdata;
-
/*
* APIC command line parameters
*/
@@ -161,33 +158,6 @@ static __init int setup_apicpmtimer(char *s)
__setup("apicpmtimer", setup_apicpmtimer);
#endif
-int x2apic_mode;
-#ifdef CONFIG_X86_X2APIC
-/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-static int x2apic_disabled;
-static int __init setup_nox2apic(char *str)
-{
- if (x2apic_enabled()) {
- int apicid = native_apic_msr_read(APIC_ID);
-
- if (apicid >= 255) {
- pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
- apicid);
- return 0;
- }
-
- pr_warning("x2apic already enabled. will disable it\n");
- } else
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
-
- nox2apic = true;
-
- return 0;
-}
-early_param("nox2apic", setup_nox2apic);
-#endif
-
unsigned long mp_lapic_addr;
int disable_apic;
/* Disable local APIC timer from the kernel commandline or via dmi quirk */
@@ -1475,7 +1445,7 @@ void setup_local_APIC(void)
#endif
}
-void end_local_APIC_setup(void)
+static void end_local_APIC_setup(void)
{
lapic_setup_esr();
@@ -1492,116 +1462,183 @@ void end_local_APIC_setup(void)
apic_pm_activate();
}
-void __init bsp_end_local_APIC_setup(void)
+/*
+ * APIC setup function for application processors. Called from smpboot.c
+ */
+void apic_ap_setup(void)
{
+ setup_local_APIC();
end_local_APIC_setup();
-
- /*
- * Now that local APIC setup is completed for BP, configure the fault
- * handling for interrupt remapping.
- */
- irq_remap_enable_fault_handling();
-
}
#ifdef CONFIG_X86_X2APIC
-/*
- * Need to disable xapic and x2apic at the same time and then enable xapic mode
- */
-static inline void __disable_x2apic(u64 msr)
-{
- wrmsrl(MSR_IA32_APICBASE,
- msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
- wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
-}
+int x2apic_mode;
-static __init void disable_x2apic(void)
+enum {
+ X2APIC_OFF,
+ X2APIC_ON,
+ X2APIC_DISABLED,
+};
+static int x2apic_state;
+
+static inline void __x2apic_disable(void)
{
u64 msr;
- if (!cpu_has_x2apic)
+ if (cpu_has_apic)
return;
rdmsrl(MSR_IA32_APICBASE, msr);
- if (msr & X2APIC_ENABLE) {
- u32 x2apic_id = read_apic_id();
-
- if (x2apic_id >= 255)
- panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+ if (!(msr & X2APIC_ENABLE))
+ return;
+ /* Disable xapic and x2apic first and then reenable xapic mode */
+ wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
+ wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic disabled\n");
+}
- pr_info("Disabling x2apic\n");
- __disable_x2apic(msr);
+static inline void __x2apic_enable(void)
+{
+ u64 msr;
- if (nox2apic) {
- clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
- setup_clear_cpu_cap(X86_FEATURE_X2APIC);
- }
+ rdmsrl(MSR_IA32_APICBASE, msr);
+ if (msr & X2APIC_ENABLE)
+ return;
+ wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ printk_once(KERN_INFO "x2apic enabled\n");
+}
- x2apic_disabled = 1;
- x2apic_mode = 0;
+static int __init setup_nox2apic(char *str)
+{
+ if (x2apic_enabled()) {
+ int apicid = native_apic_msr_read(APIC_ID);
- register_lapic_address(mp_lapic_addr);
+ if (apicid >= 255) {
+ pr_warning("Apicid: %08x, cannot enforce nox2apic\n",
+ apicid);
+ return 0;
+ }
+ pr_warning("x2apic already enabled.\n");
+ __x2apic_disable();
}
+ setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+ return 0;
}
+early_param("nox2apic", setup_nox2apic);
-void check_x2apic(void)
+/* Called from cpu_init() to enable x2apic on (secondary) cpus */
+void x2apic_setup(void)
{
- if (x2apic_enabled()) {
- pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
- x2apic_preenabled = x2apic_mode = 1;
+ /*
+ * If x2apic is not in ON state, disable it if already enabled
+ * from BIOS.
+ */
+ if (x2apic_state != X2APIC_ON) {
+ __x2apic_disable();
+ return;
}
+ __x2apic_enable();
}
-void enable_x2apic(void)
+static __init void x2apic_disable(void)
{
- u64 msr;
+ u32 x2apic_id;
- rdmsrl(MSR_IA32_APICBASE, msr);
- if (x2apic_disabled) {
- __disable_x2apic(msr);
+ if (x2apic_state != X2APIC_ON)
+ goto out;
+
+ x2apic_id = read_apic_id();
+ if (x2apic_id >= 255)
+ panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+
+ __x2apic_disable();
+ register_lapic_address(mp_lapic_addr);
+out:
+ x2apic_state = X2APIC_DISABLED;
+ x2apic_mode = 0;
+}
+
+static __init void x2apic_enable(void)
+{
+ if (x2apic_state != X2APIC_OFF)
return;
- }
- if (!x2apic_mode)
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ __x2apic_enable();
+}
+
+static __init void try_to_enable_x2apic(int remap_mode)
+{
+ if (x2apic_state == X2APIC_DISABLED)
return;
- if (!(msr & X2APIC_ENABLE)) {
- printk_once(KERN_INFO "Enabling x2apic\n");
- wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
+ if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
+ /* IR is required if there is APIC ID > 255 even when running
+ * under KVM
+ */
+ if (max_physical_apicid > 255 ||
+ !hypervisor_x2apic_available()) {
+ pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
+ x2apic_disable();
+ return;
+ }
+
+ /*
+ * without IR all CPUs can be addressed by IOAPIC/MSI
+ * only in physical mode
+ */
+ x2apic_phys = 1;
}
+ x2apic_enable();
}
-#endif /* CONFIG_X86_X2APIC */
-int __init enable_IR(void)
+void __init check_x2apic(void)
{
-#ifdef CONFIG_IRQ_REMAP
- if (!irq_remapping_supported()) {
- pr_debug("intr-remapping not supported\n");
- return -1;
+ if (x2apic_enabled()) {
+ pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
+ x2apic_mode = 1;
+ x2apic_state = X2APIC_ON;
+ } else if (!cpu_has_x2apic) {
+ x2apic_state = X2APIC_DISABLED;
}
+}
+#else /* CONFIG_X86_X2APIC */
+static int __init validate_x2apic(void)
+{
+ if (!apic_is_x2apic_enabled())
+ return 0;
+ /*
+ * Checkme: Can we simply turn off x2apic here instead of panic?
+ */
+ panic("BIOS has enabled x2apic but kernel doesn't support x2apic, please disable x2apic in BIOS.\n");
+}
+early_initcall(validate_x2apic);
- if (!x2apic_preenabled && skip_ioapic_setup) {
- pr_info("Skipped enabling intr-remap because of skipping "
- "io-apic setup\n");
+static inline void try_to_enable_x2apic(int remap_mode) { }
+static inline void __x2apic_enable(void) { }
+#endif /* !CONFIG_X86_X2APIC */
+
+static int __init try_to_enable_IR(void)
+{
+#ifdef CONFIG_X86_IO_APIC
+ if (!x2apic_enabled() && skip_ioapic_setup) {
+ pr_info("Not enabling interrupt remapping due to skipped IO-APIC setup\n");
return -1;
}
-
- return irq_remapping_enable();
#endif
- return -1;
+ return irq_remapping_enable();
}
void __init enable_IR_x2apic(void)
{
unsigned long flags;
- int ret, x2apic_enabled = 0;
- int hardware_init_ret;
-
- /* Make sure irq_remap_ops are initialized */
- setup_irq_remapping_ops();
+ int ret, ir_stat;
- hardware_init_ret = irq_remapping_prepare();
- if (hardware_init_ret && !x2apic_supported())
+ ir_stat = irq_remapping_prepare();
+ if (ir_stat < 0 && !x2apic_supported())
return;
ret = save_ioapic_entries();
@@ -1614,49 +1651,13 @@ void __init enable_IR_x2apic(void)
legacy_pic->mask_all();
mask_ioapic_entries();
- if (x2apic_preenabled && nox2apic)
- disable_x2apic();
-
- if (hardware_init_ret)
- ret = -1;
- else
- ret = enable_IR();
-
- if (!x2apic_supported())
- goto skip_x2apic;
-
- if (ret < 0) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
- */
- if (max_physical_apicid > 255 ||
- !hypervisor_x2apic_available()) {
- if (x2apic_preenabled)
- disable_x2apic();
- goto skip_x2apic;
- }
- /*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
- */
- x2apic_force_phys();
- }
+ /* If irq_remapping_prepare() succeded, try to enable it */
+ if (ir_stat >= 0)
+ ir_stat = try_to_enable_IR();
+ /* ir_stat contains the remap mode or an error code */
+ try_to_enable_x2apic(ir_stat);
- if (ret == IRQ_REMAP_XAPIC_MODE) {
- pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
- goto skip_x2apic;
- }
-
- x2apic_enabled = 1;
-
- if (x2apic_supported() && !x2apic_mode) {
- x2apic_mode = 1;
- enable_x2apic();
- pr_info("Enabled x2apic\n");
- }
-
-skip_x2apic:
- if (ret < 0) /* IR enabling failed */
+ if (ir_stat < 0)
restore_ioapic_entries();
legacy_pic->restore_mask();
local_irq_restore(flags);
@@ -1847,82 +1848,8 @@ void __init register_lapic_address(unsigned long address)
}
}
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
int apic_version[MAX_LOCAL_APIC];
-int __init APIC_init_uniprocessor(void)
-{
- if (disable_apic) {
- pr_info("Apic disabled\n");
- return -1;
- }
-#ifdef CONFIG_X86_64
- if (!cpu_has_apic) {
- disable_apic = 1;
- pr_info("Apic disabled by BIOS\n");
- return -1;
- }
-#else
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic &&
- APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-#endif
-
- default_setup_apic_routing();
-
- verify_local_APIC();
- connect_bsp_APIC();
-
-#ifdef CONFIG_X86_64
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-#else
- /*
- * Hack: In case of kdump, after a crash, kernel might be booting
- * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
- * might be zero if read from MP tables. Get it from LAPIC.
- */
-# ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = read_apic_id();
-# endif
-#endif
- physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
- setup_local_APIC();
-
-#ifdef CONFIG_X86_IO_APIC
- /*
- * Now enable IO-APICs, actually call clear_IO_APIC
- * We need clear_IO_APIC before enabling error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-#endif
-
- bsp_end_local_APIC_setup();
-
-#ifdef CONFIG_X86_IO_APIC
- if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else {
- nr_ioapics = 0;
- }
-#endif
-
- x86_init.timers.setup_percpu_clockev();
- return 0;
-}
-
/*
* Local APIC interrupts
*/
@@ -2027,7 +1954,7 @@ __visible void smp_trace_error_interrupt(struct pt_regs *regs)
/**
* connect_bsp_APIC - attach the APIC to the interrupt system
*/
-void __init connect_bsp_APIC(void)
+static void __init connect_bsp_APIC(void)
{
#ifdef CONFIG_X86_32
if (pic_mode) {
@@ -2274,6 +2201,100 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
}
}
+static void __init apic_bsp_up_setup(void)
+{
+#ifdef CONFIG_X86_64
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
+ /*
+ * Hack: In case of kdump, after a crash, kernel might be booting
+ * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
+ * might be zero if read from MP tables. Get it from LAPIC.
+ */
+# ifdef CONFIG_CRASH_DUMP
+ boot_cpu_physical_apicid = read_apic_id();
+# endif
+#endif
+ physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
+}
+
+/**
+ * apic_bsp_setup - Setup function for local apic and io-apic
+ * @upmode: Force UP mode (for APIC_init_uniprocessor)
+ *
+ * Returns:
+ * apic_id of BSP APIC
+ */
+int __init apic_bsp_setup(bool upmode)
+{
+ int id;
+
+ connect_bsp_APIC();
+ if (upmode)
+ apic_bsp_up_setup();
+ setup_local_APIC();
+
+ if (x2apic_mode)
+ id = apic_read(APIC_LDR);
+ else
+ id = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+
+ enable_IO_APIC();
+ end_local_APIC_setup();
+ irq_remap_enable_fault_handling();
+ setup_IO_APIC();
+ /* Setup local timer */
+ x86_init.timers.setup_percpu_clockev();
+ return id;
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor(void)
+{
+ if (disable_apic) {
+ pr_info("Apic disabled\n");
+ return -1;
+ }
+#ifdef CONFIG_X86_64
+ if (!cpu_has_apic) {
+ disable_apic = 1;
+ pr_info("Apic disabled by BIOS\n");
+ return -1;
+ }
+#else
+ if (!smp_found_config && !cpu_has_apic)
+ return -1;
+
+ /*
+ * Complain if the BIOS pretends there is one.
+ */
+ if (!cpu_has_apic &&
+ APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+ pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
+ boot_cpu_physical_apicid);
+ return -1;
+ }
+#endif
+
+ if (!smp_found_config)
+ disable_ioapic_support();
+
+ default_setup_apic_routing();
+ verify_local_APIC();
+ apic_bsp_setup(true);
+ return 0;
+}
+
+#ifdef CONFIG_UP_LATE_INIT
+void __init up_late_init(void)
+{
+ APIC_init_uniprocessor();
+}
+#endif
+
/*
* Power management
*/
@@ -2359,9 +2380,9 @@ static void lapic_resume(void)
mask_ioapic_entries();
legacy_pic->mask_all();
- if (x2apic_mode)
- enable_x2apic();
- else {
+ if (x2apic_mode) {
+ __x2apic_enable();
+ } else {
/*
* Make sure the APICBASE points to the right address
*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3f5f60406ab1..f4dc2462a1ac 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1507,7 +1507,10 @@ void __init enable_IO_APIC(void)
int i8259_apic, i8259_pin;
int apic, pin;
- if (!nr_legacy_irqs())
+ if (skip_ioapic_setup)
+ nr_ioapics = 0;
+
+ if (!nr_legacy_irqs() || !nr_ioapics)
return;
for_each_ioapic_pin(apic, pin) {
@@ -2295,7 +2298,7 @@ static inline void __init check_timer(void)
}
local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
- if (x2apic_preenabled)
+ if (apic_is_x2apic_enabled())
apic_printk(APIC_QUIET, KERN_INFO
"Perhaps problem with the pre-enabled x2apic mode\n"
"Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
@@ -2373,9 +2376,9 @@ void __init setup_IO_APIC(void)
{
int ioapic;
- /*
- * calling enable_IO_APIC() is moved to setup_local_APIC for BP
- */
+ if (skip_ioapic_setup || !nr_ioapics)
+ return;
+
io_apic_irqs = nr_legacy_irqs() ? ~PIC_IRQS : ~0UL;
apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e27b49d7c922..80091ae54c2b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -66,3 +66,4 @@ targets += capflags.c
$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
+clean-files += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15c5df92f74e..a220239cea65 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -869,3 +869,22 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
return false;
}
+
+void set_dr_addr_mask(unsigned long mask, int dr)
+{
+ if (!cpu_has_bpext)
+ return;
+
+ switch (dr) {
+ case 0:
+ wrmsr(MSR_F16H_DR0_ADDR_MASK, mask, 0);
+ break;
+ case 1:
+ case 2:
+ case 3:
+ wrmsr(MSR_F16H_DR1_ADDR_MASK - 1 + dr, mask, 0);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c6049650c093..b5c8ff5e9dfc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -19,6 +19,7 @@
#include <asm/archrandom.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
+#include <asm/tlbflush.h>
#include <asm/debugreg.h>
#include <asm/sections.h>
#include <asm/vsyscall.h>
@@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep);
static __always_inline void setup_smep(struct cpuinfo_x86 *c)
{
if (cpu_has(c, X86_FEATURE_SMEP))
- set_in_cr4(X86_CR4_SMEP);
+ cr4_set_bits(X86_CR4_SMEP);
}
static __init int setup_disable_smap(char *arg)
@@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_SMAP)) {
#ifdef CONFIG_X86_SMAP
- set_in_cr4(X86_CR4_SMAP);
+ cr4_set_bits(X86_CR4_SMAP);
#else
- clear_in_cr4(X86_CR4_SMAP);
+ cr4_clear_bits(X86_CR4_SMAP);
#endif
}
}
@@ -491,17 +492,18 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
u16 __read_mostly tlb_lld_4m[NR_INFO];
u16 __read_mostly tlb_lld_1g[NR_INFO];
-void cpu_detect_tlb(struct cpuinfo_x86 *c)
+static void cpu_detect_tlb(struct cpuinfo_x86 *c)
{
if (this_cpu->c_detect_tlb)
this_cpu->c_detect_tlb(c);
- printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
- "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n",
tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
- tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
- tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
- tlb_lld_1g[ENTRIES]);
+ tlb_lli_4m[ENTRIES]);
+
+ pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
+ tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES],
+ tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]);
}
void detect_ht(struct cpuinfo_x86 *c)
@@ -1294,6 +1296,12 @@ void cpu_init(void)
wait_for_master_cpu(cpu);
/*
+ * Initialize the CR4 shadow before doing anything that could
+ * try to read it.
+ */
+ cr4_init_shadow();
+
+ /*
* Load microcode on this cpu if a valid microcode is available.
* This is early microcode loading procedure.
*/
@@ -1312,7 +1320,7 @@ void cpu_init(void)
pr_debug("Initializing CPU#%d\n", cpu);
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
/*
* Initialize the per-CPU GDT with the boot GDT,
@@ -1332,7 +1340,7 @@ void cpu_init(void)
barrier();
x86_configure_nx();
- enable_x2apic();
+ x2apic_setup();
/*
* set up and load the per-CPU TSS
@@ -1393,7 +1401,7 @@ void cpu_init(void)
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de)
- clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+ cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
load_current_idt();
switch_to_new_gdt(cpu);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9cc6b6f25f42..94d7dcb12145 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -487,10 +487,8 @@ static void init_intel(struct cpuinfo_x86 *c)
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
- printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
- " Set to 'normal', was 'performance'\n"
- "ENERGY_PERF_BIAS: View and update with"
- " x86_energy_perf_policy(8)\n");
+ pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+ pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c7035073dfc1..659643376dbf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -952,20 +952,18 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
int type, char *buf)
{
- ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
- int n = 0;
-
- if (len > 1) {
- const struct cpumask *mask;
-
- mask = to_cpumask(this_leaf->shared_cpu_map);
- n = type ?
- cpulist_scnprintf(buf, len-2, mask) :
- cpumask_scnprintf(buf, len-2, mask);
- buf[n++] = '\n';
- buf[n] = '\0';
- }
- return n;
+ const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+ int ret;
+
+ if (type)
+ ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
+ cpumask_pr_args(mask));
+ else
+ ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
+ cpumask_pr_args(mask));
+ buf[ret++] = '\n';
+ buf[ret] = '\0';
+ return ret;
}
static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..3c036cb4a370 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,8 @@
#include <linux/export.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -115,7 +117,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
*/
-ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
@@ -150,14 +152,11 @@ static struct mce_log mcelog = {
void mce_log(struct mce *mce)
{
unsigned next, entry;
- int ret = 0;
/* Emit the trace record: */
trace_mce_record(mce);
- ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
- if (ret == NOTIFY_STOP)
- return;
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
mce->finished = 0;
wmb();
@@ -311,7 +310,7 @@ static void wait_for_panic(void)
panic("Panicing machine check CPU died");
}
-static void mce_panic(char *msg, struct mce *final, char *exp)
+static void mce_panic(const char *msg, struct mce *final, char *exp)
{
int i, apei_err = 0;
@@ -529,7 +528,7 @@ static void mce_schedule_work(void)
schedule_work(this_cpu_ptr(&mce_work));
}
-DEFINE_PER_CPU(struct irq_work, mce_irq_work);
+static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
static void mce_irq_work_cb(struct irq_work *entry)
{
@@ -735,7 +734,7 @@ static atomic_t mce_callin;
/*
* Check if a timeout waiting for other CPUs happened.
*/
-static int mce_timed_out(u64 *t)
+static int mce_timed_out(u64 *t, const char *msg)
{
/*
* The others already did panic for some reason.
@@ -750,8 +749,7 @@ static int mce_timed_out(u64 *t)
goto out;
if ((s64)*t < SPINUNIT) {
if (mca_cfg.tolerant <= 1)
- mce_panic("Timeout synchronizing machine check over CPUs",
- NULL, NULL);
+ mce_panic(msg, NULL, NULL);
cpu_missing = 1;
return 1;
}
@@ -867,7 +865,8 @@ static int mce_start(int *no_way_out)
* Wait for everyone.
*/
while (atomic_read(&mce_callin) != cpus) {
- if (mce_timed_out(&timeout)) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Not all CPUs entered broadcast exception handler")) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -892,7 +891,8 @@ static int mce_start(int *no_way_out)
* only seen by one CPU before cleared, avoiding duplicates.
*/
while (atomic_read(&mce_executing) < order) {
- if (mce_timed_out(&timeout)) {
+ if (mce_timed_out(&timeout,
+ "Timeout: Subject CPUs unable to finish machine check processing")) {
atomic_set(&global_nwo, 0);
return -1;
}
@@ -936,7 +936,8 @@ static int mce_end(int order)
* loops.
*/
while (atomic_read(&mce_executing) <= cpus) {
- if (mce_timed_out(&timeout))
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU unable to finish machine check processing"))
goto reset;
ndelay(SPINUNIT);
}
@@ -949,7 +950,8 @@ static int mce_end(int order)
* Subject: Wait for Monarch to finish.
*/
while (atomic_read(&mce_executing) != 0) {
- if (mce_timed_out(&timeout))
+ if (mce_timed_out(&timeout,
+ "Timeout: Monarch CPU did not finish machine check processing"))
goto reset;
ndelay(SPINUNIT);
}
@@ -1003,51 +1005,6 @@ static void mce_clear_state(unsigned long *toclear)
}
/*
- * Need to save faulting physical address associated with a process
- * in the machine check handler some place where we can grab it back
- * later in mce_notify_process()
- */
-#define MCE_INFO_MAX 16
-
-struct mce_info {
- atomic_t inuse;
- struct task_struct *t;
- __u64 paddr;
- int restartable;
-} mce_info[MCE_INFO_MAX];
-
-static void mce_save_info(__u64 addr, int c)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
- if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
- mi->t = current;
- mi->paddr = addr;
- mi->restartable = c;
- return;
- }
- }
-
- mce_panic("Too many concurrent recoverable errors", NULL, NULL);
-}
-
-static struct mce_info *mce_find_info(void)
-{
- struct mce_info *mi;
-
- for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
- if (atomic_read(&mi->inuse) && mi->t == current)
- return mi;
- return NULL;
-}
-
-static void mce_clear_info(struct mce_info *mi)
-{
- atomic_set(&mi->inuse, 0);
-}
-
-/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
*
@@ -1063,6 +1020,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
{
struct mca_config *cfg = &mca_cfg;
struct mce m, *final;
+ enum ctx_state prev_state;
int i;
int worst = 0;
int severity;
@@ -1084,6 +1042,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
char *msg = "Unknown";
+ u64 recover_paddr = ~0ull;
+ int flags = MF_ACTION_REQUIRED;
+
+ prev_state = ist_enter(regs);
this_cpu_inc(mce_exception_count);
@@ -1203,9 +1165,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
if (no_way_out)
mce_panic("Fatal machine check on current CPU", &m, msg);
if (worst == MCE_AR_SEVERITY) {
- /* schedule action before return to userland */
- mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
- set_thread_flag(TIF_MCE_NOTIFY);
+ recover_paddr = m.addr;
+ if (!(m.mcgstatus & MCG_STATUS_RIPV))
+ flags |= MF_MUST_KILL;
} else if (kill_it) {
force_sig(SIGBUS, current);
}
@@ -1216,6 +1178,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
out:
sync_core();
+
+ if (recover_paddr == ~0ull)
+ goto done;
+
+ pr_err("Uncorrected hardware memory error in user-access at %llx",
+ recover_paddr);
+ /*
+ * We must call memory_failure() here even if the current process is
+ * doomed. We still need to mark the page as poisoned and alert any
+ * other users of the page.
+ */
+ ist_begin_non_atomic(regs);
+ local_irq_enable();
+ if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
+ pr_err("Memory error not recovered");
+ force_sig(SIGBUS, current);
+ }
+ local_irq_disable();
+ ist_end_non_atomic();
+done:
+ ist_exit(regs, prev_state);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1233,42 +1216,6 @@ int memory_failure(unsigned long pfn, int vector, int flags)
#endif
/*
- * Called in process context that interrupted by MCE and marked with
- * TIF_MCE_NOTIFY, just before returning to erroneous userland.
- * This code is allowed to sleep.
- * Attempt possible recovery such as calling the high level VM handler to
- * process any corrupted pages, and kill/signal current process if required.
- * Action required errors are handled here.
- */
-void mce_notify_process(void)
-{
- unsigned long pfn;
- struct mce_info *mi = mce_find_info();
- int flags = MF_ACTION_REQUIRED;
-
- if (!mi)
- mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
- pfn = mi->paddr >> PAGE_SHIFT;
-
- clear_thread_flag(TIF_MCE_NOTIFY);
-
- pr_err("Uncorrected hardware memory error in user-access at %llx",
- mi->paddr);
- /*
- * We must call memory_failure() here even if the current process is
- * doomed. We still need to mark the page as poisoned and alert any
- * other users of the page.
- */
- if (!mi->restartable)
- flags |= MF_MUST_KILL;
- if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
- pr_err("Memory error not recovered");
- force_sig(SIGBUS, current);
- }
- mce_clear_info(mi);
-}
-
-/*
* Action optional processing happens here (picking up
* from the list of faulting pages that do_machine_check()
* placed into the "ring").
@@ -1503,7 +1450,7 @@ static void __mcheck_cpu_init_generic(void)
bitmap_fill(all_banks, MAX_NR_BANKS);
machine_check_poll(MCP_UC | m_fl, &all_banks);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
rdmsrl(MSR_IA32_MCG_CAP, cap);
if (cap & MCG_CTL_P)
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a3042989398c..737b0ad4e61a 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -8,6 +8,8 @@
#include <linux/smp.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
@@ -17,8 +19,11 @@ int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state;
u32 loaddr, hi, lotype;
+ prev_state = ist_enter(regs);
+
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -33,6 +38,8 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs, prev_state);
}
/* Set up machine check reporting for processors with Intel style MCE: */
@@ -59,7 +66,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
"Intel old style machine check architecture supported.\n");
/* Enable MCE: */
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Intel old style machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 7dc5564d0cdf..44f138296fbe 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -7,14 +7,20 @@
#include <linux/types.h>
#include <asm/processor.h>
+#include <asm/traps.h>
+#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/msr.h>
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
+ enum ctx_state prev_state = ist_enter(regs);
+
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
+ ist_exit(regs, prev_state);
}
/* Set up machine check reporting on the Winchip C6 series */
@@ -31,7 +37,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
- set_in_cr4(X86_CR4_MCE);
+ cr4_set_bits(X86_CR4_MCE);
printk(KERN_INFO
"Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c29096136b..36a83617eb21 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -552,7 +552,7 @@ static int __init microcode_init(void)
int error;
if (paravirt_enabled() || dis_ucode_ldr)
- return 0;
+ return -EINVAL;
if (c->x86_vendor == X86_VENDOR_INTEL)
microcode_ops = init_intel_microcode();
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index c6826d1e8082..746e7fd08aad 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -196,6 +196,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
struct microcode_header_intel mc_header;
unsigned int mc_size;
+ if (leftover < sizeof(mc_header)) {
+ pr_err("error! Truncated header in microcode data file\n");
+ break;
+ }
+
if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
break;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index ec9df6f9cd47..420eb933189c 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -321,7 +321,11 @@ get_matching_model_microcode(int cpu, unsigned long start,
unsigned int mc_saved_count = mc_saved_data->mc_saved_count;
int i;
- while (leftover) {
+ while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) {
+
+ if (leftover < sizeof(mc_header))
+ break;
+
mc_header = (struct microcode_header_intel *)ucode_ptr;
mc_size = get_totalsize(mc_header);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index e2b22df964cd..36d99a337b49 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -28,7 +28,7 @@ function dump_array()
# If the /* comment */ starts with a quote string, grab that.
VALUE="$(echo "$i" | sed -n 's@.*/\* *\("[^"]*"\).*\*/@\1@p')"
[ -z "$VALUE" ] && VALUE="\"$NAME\""
- [ "$VALUE" == '""' ] && continue
+ [ "$VALUE" = '""' ] && continue
# Name is uppercase, VALUE is all lowercase
VALUE="$(echo "$VALUE" | tr A-Z a-z)"
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index a450373e8e91..939155ffdece 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -107,6 +107,7 @@ static struct clocksource hyperv_cs = {
.rating = 400, /* use this when running on Hyperv*/
.read = read_hv_clock,
.mask = CLOCKSOURCE_MASK(64),
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
static void __init ms_hyperv_init_platform(void)
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 9e451b0876b5..f8c81ba0b465 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -138,8 +138,8 @@ static void prepare_set(void)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/*
@@ -171,7 +171,7 @@ static void post_set(void)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
}
static void cyrix_set_arr(unsigned int reg, unsigned long base,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0e25a1bc5ab5..7d74f7b3c6ba 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (cpu_has_pge) {
- cr4 = read_cr4();
- write_cr4(cr4 & ~X86_CR4_PGE);
+ cr4 = __read_cr4();
+ __write_cr4(cr4 & ~X86_CR4_PGE);
}
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
@@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock)
/* Restore value of CR4 */
if (cpu_has_pge)
- write_cr4(cr4);
+ __write_cr4(cr4);
raw_spin_unlock(&set_atomicity_lock);
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 143e5f5dc855..b71a7f86d68a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,8 @@
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
@@ -43,6 +45,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
};
+struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;
+
u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1327,8 +1331,6 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_STARTING:
- if (x86_pmu.attr_rdpmc)
- set_in_cr4(X86_CR4_PCE);
if (x86_pmu.cpu_starting)
x86_pmu.cpu_starting(cpu);
break;
@@ -1804,14 +1806,44 @@ static int x86_pmu_event_init(struct perf_event *event)
event->destroy(event);
}
+ if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+ event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+
return err;
}
+static void refresh_pce(void *ignored)
+{
+ if (current->mm)
+ load_mm_cr4(current->mm);
+}
+
+static void x86_pmu_event_mapped(struct perf_event *event)
+{
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ if (atomic_inc_return(&current->mm->context.perf_rdpmc_allowed) == 1)
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
+static void x86_pmu_event_unmapped(struct perf_event *event)
+{
+ if (!current->mm)
+ return;
+
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+ return;
+
+ if (atomic_dec_and_test(&current->mm->context.perf_rdpmc_allowed))
+ on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1);
+}
+
static int x86_pmu_event_idx(struct perf_event *event)
{
int idx = event->hw.idx;
- if (!x86_pmu.attr_rdpmc)
+ if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
return 0;
if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
@@ -1829,16 +1861,6 @@ static ssize_t get_attr_rdpmc(struct device *cdev,
return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
}
-static void change_rdpmc(void *info)
-{
- bool enable = !!(unsigned long)info;
-
- if (enable)
- set_in_cr4(X86_CR4_PCE);
- else
- clear_in_cr4(X86_CR4_PCE);
-}
-
static ssize_t set_attr_rdpmc(struct device *cdev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -1850,14 +1872,27 @@ static ssize_t set_attr_rdpmc(struct device *cdev,
if (ret)
return ret;
+ if (val > 2)
+ return -EINVAL;
+
if (x86_pmu.attr_rdpmc_broken)
return -ENOTSUPP;
- if (!!val != !!x86_pmu.attr_rdpmc) {
- x86_pmu.attr_rdpmc = !!val;
- on_each_cpu(change_rdpmc, (void *)val, 1);
+ if ((val == 2) != (x86_pmu.attr_rdpmc == 2)) {
+ /*
+ * Changing into or out of always available, aka
+ * perf-event-bypassing mode. This path is extremely slow,
+ * but only root can trigger it, so it's okay.
+ */
+ if (val == 2)
+ static_key_slow_inc(&rdpmc_always_available);
+ else
+ static_key_slow_dec(&rdpmc_always_available);
+ on_each_cpu(refresh_pce, NULL, 1);
}
+ x86_pmu.attr_rdpmc = val;
+
return count;
}
@@ -1900,6 +1935,9 @@ static struct pmu pmu = {
.event_init = x86_pmu_event_init,
+ .event_mapped = x86_pmu_event_mapped,
+ .event_unmapped = x86_pmu_event_unmapped,
+
.add = x86_pmu_add,
.del = x86_pmu_del,
.start = x86_pmu_start,
@@ -1914,13 +1952,15 @@ static struct pmu pmu = {
.flush_branch_stack = x86_pmu_flush_branch_stack,
};
-void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event *event,
+ struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;
userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
- userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
+ userpg->cap_user_rdpmc =
+ !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
userpg->pmc_width = x86_pmu.cntval_bits;
if (!sched_clock_stable())
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 4e6cdb0ddc70..df525d2be1e8 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,8 @@ struct event_constraint {
#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
+
struct amd_nb {
int nb_id; /* NorthBridge id */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 944bf019b74f..498b6d967138 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2431,6 +2431,7 @@ __init int intel_pmu_init(void)
break;
case 55: /* 22nm Atom "Silvermont" */
+ case 76: /* 14nm Atom "Airmont" */
case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 3c895d480cd7..073983398364 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -568,8 +568,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
};
struct event_constraint intel_slm_pebs_event_constraints[] = {
- /* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
- INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x1),
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0x1),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 673f930c700f..c4bb8b8e5017 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -103,6 +103,13 @@ static struct kobj_attribute format_attr_##_var = \
#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
+#define RAPL_EVENT_ATTR_STR(_name, v, str) \
+static struct perf_pmu_events_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, rapl_sysfs_show, NULL), \
+ .id = 0, \
+ .event_str = str, \
+};
+
struct rapl_pmu {
spinlock_t lock;
int hw_unit; /* 1/2^hw_unit Joule */
@@ -135,7 +142,7 @@ static inline u64 rapl_scale(u64 v)
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __this_cpu_read(rapl_pmu->hw_unit));
+ return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -379,23 +386,36 @@ static struct attribute_group rapl_pmu_attr_group = {
.attrs = rapl_pmu_attrs,
};
-EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
-EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
-EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
-EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
+static ssize_t rapl_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_attr *pmu_attr = \
+ container_of(attr, struct perf_pmu_events_attr, attr);
+
+ if (pmu_attr->event_str)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ return 0;
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
-EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
-EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
-EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
-EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
/*
* we compute in 0.23 nJ increments regardless of MSR
*/
-EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
-EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
static struct attribute *rapl_events_srv_attr[] = {
EVENT_PTR(rapl_cores),
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 10b8d3eaaf15..c635b8b49e93 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -840,7 +840,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
box->phys_id = phys_id;
box->pci_dev = pdev;
box->pmu = pmu;
- uncore_box_init(box);
pci_set_drvdata(pdev, box);
raw_spin_lock(&uncore_box_lock);
@@ -1004,10 +1003,8 @@ static int uncore_cpu_starting(int cpu)
pmu = &type->pmus[j];
box = *per_cpu_ptr(pmu->box, cpu);
/* called by uncore_cpu_init? */
- if (box && box->phys_id >= 0) {
- uncore_box_init(box);
+ if (box && box->phys_id >= 0)
continue;
- }
for_each_online_cpu(k) {
exist = *per_cpu_ptr(pmu->box, k);
@@ -1023,10 +1020,8 @@ static int uncore_cpu_starting(int cpu)
}
}
- if (box) {
+ if (box)
box->phys_id = phys_id;
- uncore_box_init(box);
- }
}
}
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 18eb78bbdd10..6c8c1e7e69d8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -17,7 +17,7 @@
#define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff)
#define UNCORE_PCI_DEV_IDX(data) (data & 0xff)
#define UNCORE_EXTRA_PCI_DEV 0xff
-#define UNCORE_EXTRA_PCI_DEV_MAX 2
+#define UNCORE_EXTRA_PCI_DEV_MAX 3
/* support up to 8 sockets */
#define UNCORE_SOCKET_MAX 8
@@ -257,6 +257,14 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
return box->pmu->type->num_counters;
}
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+ if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+ if (box->pmu->type->ops->init_box)
+ box->pmu->type->ops->init_box(box);
+ }
+}
+
static inline void uncore_disable_box(struct intel_uncore_box *box)
{
if (box->pmu->type->ops->disable_box)
@@ -265,6 +273,8 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
static inline void uncore_enable_box(struct intel_uncore_box *box)
{
+ uncore_box_init(box);
+
if (box->pmu->type->ops->enable_box)
box->pmu->type->ops->enable_box(box);
}
@@ -287,14 +297,6 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
return box->pmu->type->ops->read_counter(box, event);
}
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
- if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
- if (box->pmu->type->ops->init_box)
- box->pmu->type->ops->init_box(box);
- }
-}
-
static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
{
return (box->phys_id < 0);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 745b158e9a65..21af6149edf2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -891,6 +891,7 @@ void snbep_uncore_cpu_init(void)
enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
+ HSWEP_PCI_PCU_3,
};
static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
@@ -2026,6 +2027,17 @@ void hswep_uncore_cpu_init(void)
{
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+
+ /* Detect 6-8 core systems with only two SBOXes */
+ if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) {
+ u32 capid4;
+
+ pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3],
+ 0x94, &capid4);
+ if (((capid4 >> 6) & 0x3) == 0)
+ hswep_uncore_sbox.num_boxes = 2;
+ }
+
uncore_msr_uncores = hswep_msr_uncores;
}
@@ -2287,6 +2299,11 @@ static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
SNBEP_PCI_QPI_PORT1_FILTER),
},
+ { /* PCU.3 (for Capability registers) */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
+ .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
+ HSWEP_PCI_PCU_3),
+ },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index b74ebc7c4402..cf3df1d8d039 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -265,7 +265,10 @@ int __die(const char *str, struct pt_regs *regs, long err)
printk("SMP ");
#endif
#ifdef CONFIG_DEBUG_PAGEALLOC
- printk("DEBUG_PAGEALLOC");
+ printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+ printk("KASAN");
#endif
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index dd2f07ae9d0c..46201deee923 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -184,9 +184,9 @@ void __init e820_print_map(char *who)
* overwritten in the same location, starting at biosmap.
*
* The integer pointed to by pnr_map must be valid on entry (the
- * current number of valid entries located at biosmap) and will
- * be updated on return, with the new number of valid entries
- * (something no more than max_nr_map.)
+ * current number of valid entries located at biosmap). If the
+ * sanitizing succeeds the *pnr_map will be updated with the new
+ * number of valid entries (something no more than max_nr_map).
*
* The return value from sanitize_e820_map() is zero if it
* successfully 'sanitized' the map entries passed in, and is -1
@@ -561,23 +561,15 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
void __init update_e820(void)
{
- u32 nr_map;
-
- nr_map = e820.nr_map;
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map))
return;
- e820.nr_map = nr_map;
printk(KERN_INFO "e820: modified physical RAM map:\n");
e820_print_map("modified");
}
static void __init update_e820_saved(void)
{
- u32 nr_map;
-
- nr_map = e820_saved.nr_map;
- if (sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map), &nr_map))
- return;
- e820_saved.nr_map = nr_map;
+ sanitize_e820_map(e820_saved.map, ARRAY_SIZE(e820_saved.map),
+ &e820_saved.nr_map);
}
#define MAX_GAP_END 0x100000000ull
/*
@@ -898,11 +890,9 @@ early_param("memmap", parse_memmap_opt);
void __init finish_e820_parsing(void)
{
if (userdef) {
- u32 nr = e820.nr_map;
-
- if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr) < 0)
+ if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map),
+ &e820.nr_map) < 0)
early_panic("Invalid user supplied memory map");
- e820.nr_map = nr;
printk(KERN_INFO "e820: user-defined physical RAM map:\n");
e820_print_map("user");
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 01d1c187c9f9..a62536a1be88 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -19,6 +19,7 @@
#include <linux/usb/ehci_def.h>
#include <linux/efi.h>
#include <asm/efi.h>
+#include <asm/pci_x86.h>
/* Simple VGA output */
#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -76,7 +77,7 @@ static struct console early_vga_console = {
/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */
-static int early_serial_base = 0x3f8; /* ttyS0 */
+static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define XMTRDY 0x20
@@ -94,13 +95,40 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ uint32_t *vaddr = (uint32_t *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ uint32_t *vaddr = (uint32_t *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+
+static unsigned int io_serial_in(unsigned long addr, int offset)
+{
+ return inb(addr + offset);
+}
+
+static void io_serial_out(unsigned long addr, int offset, int value)
+{
+ outb(value, addr + offset);
+}
+
+static unsigned int (*serial_in)(unsigned long addr, int offset) = io_serial_in;
+static void (*serial_out)(unsigned long addr, int offset, int value) = io_serial_out;
+
static int early_serial_putc(unsigned char ch)
{
unsigned timeout = 0xffff;
- while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
+ while ((serial_in(early_serial_base, LSR) & XMTRDY) == 0 && --timeout)
cpu_relax();
- outb(ch, early_serial_base + TXR);
+ serial_out(early_serial_base, TXR, ch);
return timeout ? 0 : -1;
}
@@ -114,13 +142,28 @@ static void early_serial_write(struct console *con, const char *s, unsigned n)
}
}
+static __init void early_serial_hw_init(unsigned divisor)
+{
+ unsigned char c;
+
+ serial_out(early_serial_base, LCR, 0x3); /* 8n1 */
+ serial_out(early_serial_base, IER, 0); /* no interrupt */
+ serial_out(early_serial_base, FCR, 0); /* no fifo */
+ serial_out(early_serial_base, MCR, 0x3); /* DTR + RTS */
+
+ c = serial_in(early_serial_base, LCR);
+ serial_out(early_serial_base, LCR, c | DLAB);
+ serial_out(early_serial_base, DLL, divisor & 0xff);
+ serial_out(early_serial_base, DLH, (divisor >> 8) & 0xff);
+ serial_out(early_serial_base, LCR, c & ~DLAB);
+}
+
#define DEFAULT_BAUD 9600
static __init void early_serial_init(char *s)
{
- unsigned char c;
unsigned divisor;
- unsigned baud = DEFAULT_BAUD;
+ unsigned long baud = DEFAULT_BAUD;
char *e;
if (*s == ',')
@@ -145,24 +188,124 @@ static __init void early_serial_init(char *s)
s++;
}
- outb(0x3, early_serial_base + LCR); /* 8n1 */
- outb(0, early_serial_base + IER); /* no interrupt */
- outb(0, early_serial_base + FCR); /* no fifo */
- outb(0x3, early_serial_base + MCR); /* DTR + RTS */
+ if (*s) {
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
+ baud = DEFAULT_BAUD;
+ }
+
+ /* Convert from baud to divisor value */
+ divisor = 115200 / baud;
+
+ /* These will always be IO based ports */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
+}
+
+#ifdef CONFIG_PCI
+/*
+ * early_pci_serial_init()
+ *
+ * This function is invoked when the early_printk param starts with "pciserial"
+ * The rest of the param should be ",B:D.F,baud" where B, D & F describe the
+ * location of a PCI device that must be a UART device.
+ */
+static __init void early_pci_serial_init(char *s)
+{
+ unsigned divisor;
+ unsigned long baud = DEFAULT_BAUD;
+ u8 bus, slot, func;
+ uint32_t classcode, bar0;
+ uint16_t cmdreg;
+ char *e;
+
+
+ /*
+ * First, part the param to get the BDF values
+ */
+ if (*s == ',')
+ ++s;
+
+ if (*s == 0)
+ return;
+
+ bus = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != ':')
+ return;
+ ++s;
+ slot = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+ if (*s != '.')
+ return;
+ ++s;
+ func = (u8)simple_strtoul(s, &e, 16);
+ s = e;
+
+ /* A baud might be following */
+ if (*s == ',')
+ s++;
+
+ /*
+ * Second, find the device from the BDF
+ */
+ cmdreg = read_pci_config(bus, slot, func, PCI_COMMAND);
+ classcode = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+ bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+
+ /*
+ * Verify it is a UART type device
+ */
+ if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
+ (classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
+ (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */
+ return;
+
+ /*
+ * Determine if it is IO or memory mapped
+ */
+ if (bar0 & 0x01) {
+ /* it is IO mapped */
+ serial_in = io_serial_in;
+ serial_out = io_serial_out;
+ early_serial_base = bar0&0xfffffffc;
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_IO);
+ } else {
+ /* It is memory mapped - assume 32-bit alignment */
+ serial_in = mem32_serial_in;
+ serial_out = mem32_serial_out;
+ /* WARNING! assuming the address is always in the first 4G */
+ early_serial_base =
+ (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+ write_pci_config(bus, slot, func, PCI_COMMAND,
+ cmdreg|PCI_COMMAND_MEMORY);
+ }
+ /*
+ * Lastly, initalize the hardware
+ */
if (*s) {
- baud = simple_strtoul(s, &e, 0);
- if (baud == 0 || s == e)
+ if (strcmp(s, "nocfg") == 0)
+ /* Sometimes, we want to leave the UART alone
+ * and assume the BIOS has set it up correctly.
+ * "nocfg" tells us this is the case, and we
+ * should do no more setup.
+ */
+ return;
+ if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
baud = DEFAULT_BAUD;
}
+ /* Convert from baud to divisor value */
divisor = 115200 / baud;
- c = inb(early_serial_base + LCR);
- outb(c | DLAB, early_serial_base + LCR);
- outb(divisor & 0xff, early_serial_base + DLL);
- outb((divisor >> 8) & 0xff, early_serial_base + DLH);
- outb(c & ~DLAB, early_serial_base + LCR);
+
+ /* Set up the HW */
+ early_serial_hw_init(divisor);
}
+#endif
static struct console early_serial_console = {
.name = "earlyser",
@@ -210,6 +353,13 @@ static int __init setup_early_printk(char *buf)
early_serial_init(buf + 4);
early_console_register(&early_serial_console, keep);
}
+#ifdef CONFIG_PCI
+ if (!strncmp(buf, "pciserial", 9)) {
+ early_pci_serial_init(buf + 9);
+ early_console_register(&early_serial_console, keep);
+ buf += 9; /* Keep from match the above "serial" */
+ }
+#endif
if (!strncmp(buf, "vga", 3) &&
boot_params.screen_info.orig_video_isVGA == 1) {
max_xpos = boot_params.screen_info.orig_video_cols;
@@ -226,11 +376,6 @@ static int __init setup_early_printk(char *buf)
early_console_register(&xenboot_console, keep);
#endif
#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
- if (!strncmp(buf, "mrst", 4)) {
- mrst_early_console_init();
- early_console_register(&early_mrst_console, keep);
- }
-
if (!strncmp(buf, "hsu", 3)) {
hsu_early_console_init(buf + 3);
early_console_register(&early_hsu_console, keep);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9ebaf63ba182..db13655c3a2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -143,7 +143,8 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp)
- movq $-1,RCX+\offset(%rsp)
+ movq RIP+\offset(%rsp),\tmp /* get rip */
+ movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
movq R11+\offset(%rsp),\tmp /* get eflags */
movq \tmp,EFLAGS+\offset(%rsp)
.endm
@@ -155,27 +156,6 @@ ENDPROC(native_usergs_sysret64)
movq \tmp,R11+\offset(%rsp)
.endm
- .macro FAKE_STACK_FRAME child_rip
- /* push in order ss, rsp, eflags, cs, rip */
- xorl %eax, %eax
- pushq_cfi $__KERNEL_DS /* ss */
- /*CFI_REL_OFFSET ss,0*/
- pushq_cfi %rax /* rsp */
- CFI_REL_OFFSET rsp,0
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) /* eflags - interrupts on */
- /*CFI_REL_OFFSET rflags,0*/
- pushq_cfi $__KERNEL_CS /* cs */
- /*CFI_REL_OFFSET cs,0*/
- pushq_cfi \child_rip /* rip */
- CFI_REL_OFFSET rip,0
- pushq_cfi %rax /* orig rax */
- .endm
-
- .macro UNFAKE_STACK_FRAME
- addq $8*6, %rsp
- CFI_ADJUST_CFA_OFFSET -(6*8)
- .endm
-
/*
* initial frame state for interrupts (and exceptions without error code)
*/
@@ -238,51 +218,6 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-/* save partial stack frame */
- .macro SAVE_ARGS_IRQ
- cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
-
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
- je 1f
- SWAPGS
- /*
- * irq_count is used to check if a CPU is already on an interrupt stack
- * or not. While this is essentially redundant with preempt_count it is
- * a little cheaper to use a separate counter in the PDA (short of
- * moving irq_enter into assembly, which would be too much work)
- */
-1: incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
- pushq %rsi
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
- 0x22 /* DW_OP_plus */
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
- .endm
-
ENTRY(save_paranoid)
XCPT_FRAME 1 RDI+8
cld
@@ -426,15 +361,12 @@ system_call_fastpath:
* Has incomplete stack frame and undefined top of stack.
*/
ret_from_sys_call:
- movl $_TIF_ALLWORK_MASK,%edi
- /* edi: flagmask */
-sysret_check:
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+
LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
- andl %edi,%edx
- jnz sysret_careful
CFI_REMEMBER_STATE
/*
* sysretq will re-enable interrupts:
@@ -448,49 +380,10 @@ sysret_check:
USERGS_SYSRET64
CFI_RESTORE_STATE
- /* Handle reschedules */
- /* edx: work, edi: workmask */
-sysret_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc sysret_signal
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
- SCHEDULE_USER
- popq_cfi %rdi
- jmp sysret_check
- /* Handle a signal */
-sysret_signal:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-#ifdef CONFIG_AUDITSYSCALL
- bt $TIF_SYSCALL_AUDIT,%edx
- jc sysret_audit
-#endif
- /*
- * We have a signal, or exit tracing or single-step.
- * These all wind up with the iret return path anyway,
- * so just join that path right now.
- */
+int_ret_from_sys_call_fixup:
FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_check_syscall_exit_work
-
-#ifdef CONFIG_AUDITSYSCALL
- /*
- * Return fast path for syscall audit. Call __audit_syscall_exit()
- * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
- * masked off.
- */
-sysret_audit:
- movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
- cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- jmp sysret_check
-#endif /* CONFIG_AUDITSYSCALL */
+ jmp int_ret_from_sys_call
/* Do syscall tracing */
tracesys:
@@ -626,19 +519,6 @@ END(\label)
FORK_LIKE vfork
FIXED_FRAME stub_iopl, sys_iopl
-ENTRY(ptregscall_common)
- DEFAULT_FRAME 1 8 /* offset 8: return address */
- RESTORE_TOP_OF_STACK %r11, 8
- movq_cfi_restore R15+8, r15
- movq_cfi_restore R14+8, r14
- movq_cfi_restore R13+8, r13
- movq_cfi_restore R12+8, r12
- movq_cfi_restore RBP+8, rbp
- movq_cfi_restore RBX+8, rbx
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(ptregscall_common)
-
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
@@ -779,7 +659,48 @@ END(interrupt)
/* reserve pt_regs for scratch regs and rbp */
subq $ORIG_RAX-RBP, %rsp
CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
- SAVE_ARGS_IRQ
+ cld
+ /* start from rbp in pt_regs and jump over */
+ movq_cfi rdi, (RDI-RBP)
+ movq_cfi rsi, (RSI-RBP)
+ movq_cfi rdx, (RDX-RBP)
+ movq_cfi rcx, (RCX-RBP)
+ movq_cfi rax, (RAX-RBP)
+ movq_cfi r8, (R8-RBP)
+ movq_cfi r9, (R9-RBP)
+ movq_cfi r10, (R10-RBP)
+ movq_cfi r11, (R11-RBP)
+
+ /* Save rbp so that we can unwind from get_irq_regs() */
+ movq_cfi rbp, 0
+
+ /* Save previous stack value */
+ movq %rsp, %rsi
+
+ leaq -RBP(%rsp),%rdi /* arg1 for handler */
+ testl $3, CS-RBP(%rsi)
+ je 1f
+ SWAPGS
+ /*
+ * irq_count is used to check if a CPU is already on an interrupt stack
+ * or not. While this is essentially redundant with preempt_count it is
+ * a little cheaper to use a separate counter in the PDA (short of
+ * moving irq_enter into assembly, which would be too much work)
+ */
+1: incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
+ CFI_DEF_CFA_REGISTER rsi
+
+ /* Store previous stack value */
+ pushq %rsi
+ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
+ 0x77 /* DW_OP_breg7 */, 0, \
+ 0x06 /* DW_OP_deref */, \
+ 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x22 /* DW_OP_plus */
+ /* We entered an interrupt context - irqs are off: */
+ TRACE_IRQS_OFF
+
call \func
.endm
@@ -831,6 +752,60 @@ retint_swapgs: /* return to user-space */
*/
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq (RCX-R11)(%rsp), %rcx
+ cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- sysret checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq (R11-ARGOFFSET)(%rsp), %r11
+ cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+irq_return_via_sysret:
+ CFI_REMEMBER_STATE
+ RESTORE_ARGS 1,8,1
+ movq (RSP-RIP)(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
SWAPGS
jmp restore_args
@@ -1048,6 +1023,11 @@ ENTRY(\sym)
CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
.if \paranoid
+ .if \paranoid == 1
+ CFI_REMEMBER_STATE
+ testl $3, CS(%rsp) /* If coming from userspace, switch */
+ jnz 1f /* stacks. */
+ .endif
call save_paranoid
.else
call error_entry
@@ -1088,6 +1068,36 @@ ENTRY(\sym)
jmp error_exit /* %ebx: no swapgs flag */
.endif
+ .if \paranoid == 1
+ CFI_RESTORE_STATE
+ /*
+ * Paranoid entry from userspace. Switch stacks and treat it
+ * as a normal entry. This means that paranoid handlers
+ * run in real process context if user_mode(regs).
+ */
+1:
+ call error_entry
+
+ DEFAULT_FRAME 0
+
+ movq %rsp,%rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax,%rsp /* switch stack */
+
+ movq %rsp,%rdi /* pt_regs pointer */
+
+ .if \has_error_code
+ movq ORIG_RAX(%rsp),%rsi /* get error code */
+ movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ .else
+ xorl %esi,%esi /* no error code */
+ .endif
+
+ call \do_sym
+
+ jmp error_exit /* %ebx: no swapgs flag */
+ .endif
+
CFI_ENDPROC
END(\sym)
.endm
@@ -1108,7 +1118,7 @@ idtentry overflow do_overflow has_error_code=0
idtentry bounds do_bounds has_error_code=0
idtentry invalid_op do_invalid_op has_error_code=0
idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=1
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
idtentry invalid_TSS do_invalid_TSS has_error_code=1
idtentry segment_not_present do_segment_not_present has_error_code=1
@@ -1289,16 +1299,14 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
#endif
/*
- * "Paranoid" exit path from exception stack.
- * Paranoid because this is used by NMIs and cannot take
- * any kernel state for granted.
- * We don't do kernel preemption checks here, because only
- * NMI should be common and it does not enable IRQs and
- * cannot get reschedule ticks.
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
*
- * "trace" is 0 for the NMI handler only, because irq-tracing
- * is fundamentally NMI-unsafe. (we cannot change the soft and
- * hard flags at once, atomically)
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
*/
/* ebx: no swapgs flag */
@@ -1308,43 +1316,14 @@ ENTRY(paranoid_exit)
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
jnz paranoid_restore
- testl $3,CS(%rsp)
- jnz paranoid_userspace
-paranoid_swapgs:
TRACE_IRQS_IRETQ 0
SWAPGS_UNSAFE_STACK
RESTORE_ALL 8
- jmp irq_return
+ INTERRUPT_RETURN
paranoid_restore:
TRACE_IRQS_IRETQ_DEBUG 0
RESTORE_ALL 8
- jmp irq_return
-paranoid_userspace:
- GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%ebx
- andl $_TIF_WORK_MASK,%ebx
- jz paranoid_swapgs
- movq %rsp,%rdi /* &pt_regs */
- call sync_regs
- movq %rax,%rsp /* switch stack for scheduling */
- testl $_TIF_NEED_RESCHED,%ebx
- jnz paranoid_schedule
- movl %ebx,%edx /* arg3: thread flags */
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- xorl %esi,%esi /* arg2: oldset */
- movq %rsp,%rdi /* arg1: &pt_regs */
- call do_notify_resume
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
-paranoid_schedule:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- SCHEDULE_USER
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- jmp paranoid_userspace
+ INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 2142376dc8c6..8b7b0a51e742 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size)
}
static inline void tramp_free(void *tramp)
{
- module_free(NULL, tramp);
+ module_memfree(tramp);
}
#else
/* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6c1b9836995..2911ef3a9f1c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void)
asmlinkage __visible void __init i386_start_kernel(void)
{
+ cr4_init_shadow();
sanitize_boot_params(&boot_params);
/* Call the subarch specific early setup function */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..c4f8d4659070 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
#include <asm/bios_ebda.h>
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
+#include <asm/kasan.h>
/*
* Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
next_early_pgt = 0;
- write_cr3(__pa(early_level4_pgt));
+ write_cr3(__pa_nodebug(early_level4_pgt));
}
/* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
pmdval_t pmd, *pmd_p;
/* Invalid address or early pgt is done ? */
- if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+ if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
return -1;
again:
@@ -155,9 +156,13 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
+ cr4_init_shadow();
+
/* Kill off the identity-map trampoline */
reset_early_page_tables();
+ kasan_map_early_shadow(early_level4_pgt);
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
@@ -179,6 +184,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
+ kasan_map_early_shadow(init_level4_pgt);
+
x86_64_start_reservations(real_mode_data);
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..6fd514d9f69a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -514,8 +514,38 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+#ifdef CONFIG_KASAN
+#define FILL(VAL, COUNT) \
+ .rept (COUNT) ; \
+ .quad (VAL) ; \
+ .endr
+
+NEXT_PAGE(kasan_zero_pte)
+ FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pmd)
+ FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
+NEXT_PAGE(kasan_zero_pud)
+ FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
+
+#undef FILL
+#endif
+
+
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
+
+#ifdef CONFIG_KASAN
+/*
+ * This page used as early shadow. We don't use empty_zero_page
+ * at early stages, stack instrumentation could write some garbage
+ * to this page.
+ * Latter we reuse it as zero shadow for large ranges of memory
+ * that allowed to access, but not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+NEXT_PAGE(kasan_zero_page)
+ .skip PAGE_SIZE
+#endif
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 319bcb9372fe..3acbff4716b0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line)
#define hpet_print_config() \
do { \
if (hpet_verbose) \
- _hpet_print_config(__FUNCTION__, __LINE__); \
+ _hpet_print_config(__func__, __LINE__); \
} while (0)
/*
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 3d5fb509bdeb..7114ba220fd4 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -126,6 +126,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
*dr7 |= encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
+ if (info->mask)
+ set_dr_addr_mask(info->mask, i);
return 0;
}
@@ -161,29 +163,8 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
*dr7 &= ~__encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
-}
-
-static int get_hbp_len(u8 hbp_len)
-{
- unsigned int len_in_bytes = 0;
-
- switch (hbp_len) {
- case X86_BREAKPOINT_LEN_1:
- len_in_bytes = 1;
- break;
- case X86_BREAKPOINT_LEN_2:
- len_in_bytes = 2;
- break;
- case X86_BREAKPOINT_LEN_4:
- len_in_bytes = 4;
- break;
-#ifdef CONFIG_X86_64
- case X86_BREAKPOINT_LEN_8:
- len_in_bytes = 8;
- break;
-#endif
- }
- return len_in_bytes;
+ if (info->mask)
+ set_dr_addr_mask(0, i);
}
/*
@@ -196,7 +177,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
va = info->address;
- len = get_hbp_len(info->len);
+ len = bp->attr.bp_len;
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
@@ -277,6 +258,8 @@ static int arch_build_bp_info(struct perf_event *bp)
}
/* Len */
+ info->mask = 0;
+
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
info->len = X86_BREAKPOINT_LEN_1;
@@ -293,11 +276,17 @@ static int arch_build_bp_info(struct perf_event *bp)
break;
#endif
default:
- return -EINVAL;
+ if (!is_power_of_2(bp->attr.bp_len))
+ return -EINVAL;
+ if (!cpu_has_bpext)
+ return -EOPNOTSUPP;
+ info->mask = bp->attr.bp_len - 1;
+ info->len = X86_BREAKPOINT_LEN_1;
}
return 0;
}
+
/*
* Validate the arch-specific HW Breakpoint register settings
*/
@@ -312,11 +301,11 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
if (ret)
return ret;
- ret = -EINVAL;
-
switch (info->len) {
case X86_BREAKPOINT_LEN_1:
align = 0;
+ if (info->mask)
+ align = info->mask;
break;
case X86_BREAKPOINT_LEN_2:
align = 1;
@@ -330,7 +319,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
break;
#endif
default:
- return ret;
+ WARN_ON_ONCE(1);
}
/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..d5651fce0b71 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -13,12 +13,26 @@
#include <asm/sigcontext.h>
#include <asm/processor.h>
#include <asm/math_emu.h>
+#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include <asm/ptrace.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/user.h>
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+void kernel_fpu_disable(void)
+{
+ WARN_ON(this_cpu_read(in_kernel_fpu));
+ this_cpu_write(in_kernel_fpu, true);
+}
+
+void kernel_fpu_enable(void)
+{
+ this_cpu_write(in_kernel_fpu, false);
+}
+
/*
* Were we in an interrupt that interrupted kernel mode?
*
@@ -33,6 +47,9 @@
*/
static inline bool interrupted_kernel_fpu_idle(void)
{
+ if (this_cpu_read(in_kernel_fpu))
+ return false;
+
if (use_eager_fpu())
return __thread_has_fpu(current);
@@ -73,10 +90,10 @@ void __kernel_fpu_begin(void)
{
struct task_struct *me = current;
+ this_cpu_write(in_kernel_fpu, true);
+
if (__thread_has_fpu(me)) {
- __thread_clear_has_fpu(me);
__save_init_fpu(me);
- /* We do 'stts()' in __kernel_fpu_end() */
} else if (!use_eager_fpu()) {
this_cpu_write(fpu_owner_task, NULL);
clts();
@@ -86,19 +103,16 @@ EXPORT_SYMBOL(__kernel_fpu_begin);
void __kernel_fpu_end(void)
{
- if (use_eager_fpu()) {
- /*
- * For eager fpu, most the time, tsk_used_math() is true.
- * Restore the user math as we are done with the kernel usage.
- * At few instances during thread exit, signal handling etc,
- * tsk_used_math() is false. Those few places will take proper
- * actions, so we don't need to restore the math here.
- */
- if (likely(tsk_used_math(current)))
- math_state_restore();
- } else {
+ struct task_struct *me = current;
+
+ if (__thread_has_fpu(me)) {
+ if (WARN_ON(restore_fpu_checking(me)))
+ drop_init_fpu(me);
+ } else if (!use_eager_fpu()) {
stts();
}
+
+ this_cpu_write(in_kernel_fpu, false);
}
EXPORT_SYMBOL(__kernel_fpu_end);
@@ -180,7 +194,7 @@ void fpu_init(void)
if (cpu_has_xmm)
cr4_mask |= X86_CR4_OSXMMEXCPT;
if (cr4_mask)
- set_in_cr4(cr4_mask);
+ cr4_set_bits(cr4_mask);
cr0 = read_cr0();
cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 6307a0f0cf17..67b1cbe0093a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -127,7 +127,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_puts(p, " Machine check polls\n");
#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
- seq_printf(p, "%*s: ", prec, "THR");
+ seq_printf(p, "%*s: ", prec, "HYP");
for_each_online_cpu(j)
seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
seq_puts(p, " Hypervisor callback interrupts\n");
@@ -302,6 +302,9 @@ int check_irq_vectors_for_cpu_disable(void)
irq = __this_cpu_read(vector_irq[vector]);
if (irq >= 0) {
desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
cpu_clear(this_cpu, affinity_new);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 63ce838e5a54..28d28f5eb8f4 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -69,16 +69,9 @@ static void call_on_stack(void *func, void *stack)
: "memory", "cc", "edx", "ecx", "eax");
}
-/* how to get the current stack pointer from C */
-#define current_stack_pointer ({ \
- unsigned long sp; \
- asm("mov %%esp,%0" : "=g" (sp)); \
- sp; \
-})
-
static inline void *current_stack(void)
{
- return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+ return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1));
}
static inline int
@@ -103,7 +96,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
/* Save the next esp at the bottom of the stack */
prev_esp = (u32 *)irqstk;
- *prev_esp = current_stack_pointer;
+ *prev_esp = current_stack_pointer();
if (unlikely(overflow))
call_on_stack(print_stack_overflow, isp);
@@ -156,7 +149,7 @@ void do_softirq_own_stack(void)
/* Push the previous esp onto the stack */
prev_esp = (u32 *)irqstk;
- *prev_esp = current_stack_pointer;
+ *prev_esp = current_stack_pointer();
call_on_stack(__do_softirq, isp);
}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f7e3cd50ece0..6a1146ea4d4d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -84,7 +84,7 @@ static volatile u32 twobyte_is_boostable[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
- W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */
+ W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
@@ -1020,6 +1020,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
regs->flags &= ~X86_EFLAGS_IF;
trace_hardirqs_off();
regs->ip = (unsigned long)(jp->entry);
+
+ /*
+ * jprobes use jprobe_return() which skips the normal return
+ * path of the function, and this messes up the accounting of the
+ * function graph tracer to get messed up.
+ *
+ * Pause function graph tracing while performing the jprobe function.
+ */
+ pause_graph_tracing();
return 1;
}
NOKPROBE_SYMBOL(setjmp_pre_handler);
@@ -1048,24 +1057,25 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
u8 *addr = (u8 *) (regs->ip - 1);
struct jprobe *jp = container_of(p, struct jprobe, kp);
+ void *saved_sp = kcb->jprobe_saved_sp;
if ((addr > (u8 *) jprobe_return) &&
(addr < (u8 *) jprobe_return_end)) {
- if (stack_addr(regs) != kcb->jprobe_saved_sp) {
+ if (stack_addr(regs) != saved_sp) {
struct pt_regs *saved_regs = &kcb->jprobe_saved_regs;
printk(KERN_ERR
"current sp %p does not match saved sp %p\n",
- stack_addr(regs), kcb->jprobe_saved_sp);
+ stack_addr(regs), saved_sp);
printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
show_regs(saved_regs);
printk(KERN_ERR "Current registers\n");
show_regs(regs);
BUG();
}
+ /* It's OK to start function graph tracing again */
+ unpause_graph_tracing();
*regs = kcb->jprobe_saved_regs;
- memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp),
- kcb->jprobes_stack,
- MIN_STACK_SIZE(kcb->jprobe_saved_sp));
+ memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp));
preempt_enable_no_resched();
return 1;
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..0dd8d089c315 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
* Target instructions MUST be relocatable (checked inside)
* This is called when new aggr(opt)probe is allocated or reused.
*/
-int arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
+ struct kprobe *__unused)
{
u8 *buf;
int ret;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 94f643484300..e354cc6446ab 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -609,7 +609,7 @@ static inline void check_zero(void)
u8 ret;
u8 old;
- old = ACCESS_ONCE(zero_stats);
+ old = READ_ONCE(zero_stats);
if (unlikely(old)) {
ret = cmpxchg(&zero_stats, old, 0);
/* This ensures only one fellow resets the stat */
@@ -727,6 +727,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
int cpu;
u64 start;
unsigned long flags;
+ __ticket_t head;
if (in_nmi())
return;
@@ -768,11 +769,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
*/
__ticket_enter_slowpath(lock);
+ /* make sure enter_slowpath, which is atomic does not cross the read */
+ smp_mb__after_atomic();
+
/*
* check again make sure it didn't become free while
* we weren't looking.
*/
- if (ACCESS_ONCE(lock->tickets.head) == want) {
+ head = READ_ONCE(lock->tickets.head);
+ if (__tickets_equal(head, want)) {
add_stats(TAKEN_SLOW_PICKUP, 1);
goto out;
}
@@ -803,8 +808,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
add_stats(RELEASED_SLOW, 1);
for_each_cpu(cpu, &waiting_cpus) {
const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
- if (ACCESS_ONCE(w->lock) == lock &&
- ACCESS_ONCE(w->want) == ticket) {
+ if (READ_ONCE(w->lock) == lock &&
+ READ_ONCE(w->want) == ticket) {
add_stats(RELEASED_SLOW_KICKED, 1);
kvm_kick_cpu(cpu);
break;
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
new file mode 100644
index 000000000000..ff3c3101d003
--- /dev/null
+++ b/arch/x86/kernel/livepatch.c
@@ -0,0 +1,90 @@
+/*
+ * livepatch.c - x86-specific Kernel Live Patching Core
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/page_types.h>
+#include <asm/elf.h>
+#include <asm/livepatch.h>
+
+/**
+ * klp_write_module_reloc() - write a relocation in a module
+ * @mod: module in which the section to be modified is found
+ * @type: ELF relocation type (see asm/elf.h)
+ * @loc: address that the relocation should be written to
+ * @value: relocation value (sym address + addend)
+ *
+ * This function writes a relocation to the specified location for
+ * a particular module.
+ */
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+ unsigned long loc, unsigned long value)
+{
+ int ret, numpages, size = 4;
+ bool readonly;
+ unsigned long val;
+ unsigned long core = (unsigned long)mod->module_core;
+ unsigned long core_ro_size = mod->core_ro_size;
+ unsigned long core_size = mod->core_size;
+
+ switch (type) {
+ case R_X86_64_NONE:
+ return 0;
+ case R_X86_64_64:
+ val = value;
+ size = 8;
+ break;
+ case R_X86_64_32:
+ val = (u32)value;
+ break;
+ case R_X86_64_32S:
+ val = (s32)value;
+ break;
+ case R_X86_64_PC32:
+ val = (u32)(value - loc);
+ break;
+ default:
+ /* unsupported relocation type */
+ return -EINVAL;
+ }
+
+ if (loc < core || loc >= core + core_size)
+ /* loc does not point to any symbol inside the module */
+ return -EINVAL;
+
+ if (loc < core + core_ro_size)
+ readonly = true;
+ else
+ readonly = false;
+
+ /* determine if the relocation spans a page boundary */
+ numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
+
+ if (readonly)
+ set_memory_rw(loc & PAGE_MASK, numpages);
+
+ ret = probe_kernel_write((void *)loc, &val, size);
+
+ if (readonly)
+ set_memory_ro(loc & PAGE_MASK, numpages);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e69f9882bf95..9bbb9b35c144 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
#include <linux/fs.h>
#include <linux/string.h>
#include <linux/kernel.h>
+#include <linux/kasan.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -46,21 +47,13 @@ do { \
#ifdef CONFIG_RANDOMIZE_BASE
static unsigned long module_load_offset;
-static int randomize_modules = 1;
/* Mutex protects the module_load_offset. */
static DEFINE_MUTEX(module_kaslr_mutex);
-static int __init parse_nokaslr(char *p)
-{
- randomize_modules = 0;
- return 0;
-}
-early_param("nokaslr", parse_nokaslr);
-
static unsigned long int get_module_load_offset(void)
{
- if (randomize_modules) {
+ if (kaslr_enabled) {
mutex_lock(&module_kaslr_mutex);
/*
* Calculate the module_load_offset the first time this
@@ -83,13 +76,22 @@ static unsigned long int get_module_load_offset(void)
void *module_alloc(unsigned long size)
{
+ void *p;
+
if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
- return __vmalloc_node_range(size, 1,
+
+ p = __vmalloc_node_range(size, MODULE_ALIGN,
MODULES_VADDR + get_module_load_offset(),
MODULES_END, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC, NUMA_NO_NODE,
+ PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
__builtin_return_address(0));
+ if (p && (kasan_module_alloc(p, size) < 0)) {
+ vfree(p);
+ return NULL;
+ }
+
+ return p;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index e309cc5c276e..781861cc5ee8 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -78,6 +78,14 @@ u64 perf_reg_abi(struct task_struct *task)
{
return PERF_SAMPLE_REGS_ABI_32;
}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ regs_user->regs = task_pt_regs(current);
+ regs_user->abi = perf_reg_abi(current);
+}
#else /* CONFIG_X86_64 */
#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
(1ULL << PERF_REG_X86_ES) | \
@@ -102,4 +110,86 @@ u64 perf_reg_abi(struct task_struct *task)
else
return PERF_SAMPLE_REGS_ABI_64;
}
+
+void perf_get_regs_user(struct perf_regs *regs_user,
+ struct pt_regs *regs,
+ struct pt_regs *regs_user_copy)
+{
+ struct pt_regs *user_regs = task_pt_regs(current);
+
+ /*
+ * If we're in an NMI that interrupted task_pt_regs setup, then
+ * we can't sample user regs at all. This check isn't really
+ * sufficient, though, as we could be in an NMI inside an interrupt
+ * that happened during task_pt_regs setup.
+ */
+ if (regs->sp > (unsigned long)&user_regs->r11 &&
+ regs->sp <= (unsigned long)(user_regs + 1)) {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
+ return;
+ }
+
+ /*
+ * RIP, flags, and the argument registers are usually saved.
+ * orig_ax is probably okay, too.
+ */
+ regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->cx = user_regs->cx;
+ regs_user_copy->dx = user_regs->dx;
+ regs_user_copy->si = user_regs->si;
+ regs_user_copy->di = user_regs->di;
+ regs_user_copy->r8 = user_regs->r8;
+ regs_user_copy->r9 = user_regs->r9;
+ regs_user_copy->r10 = user_regs->r10;
+ regs_user_copy->r11 = user_regs->r11;
+ regs_user_copy->orig_ax = user_regs->orig_ax;
+ regs_user_copy->flags = user_regs->flags;
+
+ /*
+ * Don't even try to report the "rest" regs.
+ */
+ regs_user_copy->bx = -1;
+ regs_user_copy->bp = -1;
+ regs_user_copy->r12 = -1;
+ regs_user_copy->r13 = -1;
+ regs_user_copy->r14 = -1;
+ regs_user_copy->r15 = -1;
+
+ /*
+ * For this to be at all useful, we need a reasonable guess for
+ * sp and the ABI. Be careful: we're in NMI context, and we're
+ * considering current to be the current task, so we should
+ * be careful not to look at any other percpu variables that might
+ * change during context switches.
+ */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
+ task_thread_info(current)->status & TS_COMPAT) {
+ /* Easy case: we're in a compat syscall. */
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ } else if (user_regs->orig_ax != -1) {
+ /*
+ * We're probably in a 64-bit syscall.
+ * Warning: this code is severely racy. At least it's better
+ * than just blindly copying user_regs.
+ */
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
+ regs_user_copy->sp = this_cpu_read(old_rsp);
+ regs_user_copy->cs = __USER_CS;
+ regs_user_copy->ss = __USER_DS;
+ regs_user_copy->cx = -1; /* usually contains garbage */
+ } else {
+ /* We're probably in an interrupt or exception. */
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
+ }
+
+ regs_user->regs = regs_user_copy;
+}
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c
index 0ee5025e0fa4..d66a4fe6caee 100644
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -25,8 +25,6 @@
#include <asm/pmc_atom.h>
-#define DRIVER_NAME KBUILD_MODNAME
-
struct pmc_dev {
u32 base_addr;
void __iomem *regmap;
@@ -38,12 +36,12 @@ struct pmc_dev {
static struct pmc_dev pmc_device;
static u32 acpi_base_addr;
-struct pmc_dev_map {
+struct pmc_bit_map {
const char *name;
u32 bit_mask;
};
-static const struct pmc_dev_map dev_map[] = {
+static const struct pmc_bit_map dev_map[] = {
{"0 - LPSS1_F0_DMA", BIT_LPSS1_F0_DMA},
{"1 - LPSS1_F1_PWM1", BIT_LPSS1_F1_PWM1},
{"2 - LPSS1_F2_PWM2", BIT_LPSS1_F2_PWM2},
@@ -82,6 +80,27 @@ static const struct pmc_dev_map dev_map[] = {
{"35 - DFX", BIT_DFX},
};
+static const struct pmc_bit_map pss_map[] = {
+ {"0 - GBE", PMC_PSS_BIT_GBE},
+ {"1 - SATA", PMC_PSS_BIT_SATA},
+ {"2 - HDA", PMC_PSS_BIT_HDA},
+ {"3 - SEC", PMC_PSS_BIT_SEC},
+ {"4 - PCIE", PMC_PSS_BIT_PCIE},
+ {"5 - LPSS", PMC_PSS_BIT_LPSS},
+ {"6 - LPE", PMC_PSS_BIT_LPE},
+ {"7 - DFX", PMC_PSS_BIT_DFX},
+ {"8 - USH_CTRL", PMC_PSS_BIT_USH_CTRL},
+ {"9 - USH_SUS", PMC_PSS_BIT_USH_SUS},
+ {"10 - USH_VCCS", PMC_PSS_BIT_USH_VCCS},
+ {"11 - USH_VCCA", PMC_PSS_BIT_USH_VCCA},
+ {"12 - OTG_CTRL", PMC_PSS_BIT_OTG_CTRL},
+ {"13 - OTG_VCCS", PMC_PSS_BIT_OTG_VCCS},
+ {"14 - OTG_VCCA_CLK", PMC_PSS_BIT_OTG_VCCA_CLK},
+ {"15 - OTG_VCCA", PMC_PSS_BIT_OTG_VCCA},
+ {"16 - USB", PMC_PSS_BIT_USB},
+ {"17 - USB_SUS", PMC_PSS_BIT_USB_SUS},
+};
+
static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
{
return readl(pmc->regmap + reg_offset);
@@ -169,6 +188,32 @@ static const struct file_operations pmc_dev_state_ops = {
.release = single_release,
};
+static int pmc_pss_state_show(struct seq_file *s, void *unused)
+{
+ struct pmc_dev *pmc = s->private;
+ u32 pss = pmc_reg_read(pmc, PMC_PSS);
+ int pss_index;
+
+ for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
+ seq_printf(s, "Island: %-32s\tState: %s\n",
+ pss_map[pss_index].name,
+ pss_map[pss_index].bit_mask & pss ? "Off" : "On");
+ }
+ return 0;
+}
+
+static int pmc_pss_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pmc_pss_state_show, inode->i_private);
+}
+
+static const struct file_operations pmc_pss_state_ops = {
+ .open = pmc_pss_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
{
struct pmc_dev *pmc = s->private;
@@ -202,11 +247,7 @@ static const struct file_operations pmc_sleep_tmr_ops = {
static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
{
- if (!pmc->dbgfs_dir)
- return;
-
debugfs_remove_recursive(pmc->dbgfs_dir);
- pmc->dbgfs_dir = NULL;
}
static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
@@ -217,19 +258,29 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
if (!dir)
return -ENOMEM;
+ pmc->dbgfs_dir = dir;
+
f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_dev_state_ops);
if (!f) {
- dev_err(&pdev->dev, "dev_states register failed\n");
+ dev_err(&pdev->dev, "dev_state register failed\n");
goto err;
}
+
+ f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
+ dir, pmc, &pmc_pss_state_ops);
+ if (!f) {
+ dev_err(&pdev->dev, "pss_state register failed\n");
+ goto err;
+ }
+
f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
dir, pmc, &pmc_sleep_tmr_ops);
if (!f) {
dev_err(&pdev->dev, "sleep_state register failed\n");
goto err;
}
- pmc->dbgfs_dir = dir;
+
return 0;
err:
pmc_dbgfs_unregister(pmc);
@@ -292,7 +343,6 @@ MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
static int __init pmc_atom_init(void)
{
- int err = -ENODEV;
struct pci_dev *pdev = NULL;
const struct pci_device_id *ent;
@@ -306,14 +356,11 @@ static int __init pmc_atom_init(void)
*/
for_each_pci_dev(pdev) {
ent = pci_match_id(pmc_pci_ids, pdev);
- if (ent) {
- err = pmc_setup_dev(pdev);
- goto out;
- }
+ if (ent)
+ return pmc_setup_dev(pdev);
}
/* Device not found. */
-out:
- return err;
+ return -ENODEV;
}
module_init(pmc_atom_init);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..046e2d620bbe 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
+#include <asm/tlbflush.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -141,7 +142,7 @@ void flush_thread(void)
static void hard_disable_TSC(void)
{
- write_cr4(read_cr4() | X86_CR4_TSD);
+ cr4_set_bits(X86_CR4_TSD);
}
void disable_TSC(void)
@@ -158,7 +159,7 @@ void disable_TSC(void)
static void hard_enable_TSC(void)
{
- write_cr4(read_cr4() & ~X86_CR4_TSD);
+ cr4_clear_bits(X86_CR4_TSD);
}
static void enable_TSC(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8f3ebfe710d0..603c4f99cb5a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4_safe();
+ cr4 = __read_cr4_safe();
printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..67fcc43577d2 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all)
cr0 = read_cr0();
cr2 = read_cr2();
cr3 = read_cr3();
- cr4 = read_cr4();
+ cr4 = __read_cr4();
printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
fs, fsindex, gs, gsindex, shadowgs);
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index ca9622a25e95..cd9685235df9 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now)
retval = set_rtc_time(&tm);
if (retval)
printk(KERN_ERR "%s: RTC write failed with error %d\n",
- __FUNCTION__, retval);
+ __func__, retval);
} else {
printk(KERN_ERR
"%s: Invalid RTC value: write of %lx to RTC failed\n",
- __FUNCTION__, nowtime);
+ __func__, nowtime);
retval = -EINVAL;
}
return retval;
@@ -170,7 +170,7 @@ static struct platform_device rtc_device = {
static __init int add_rtc_cmos(void)
{
#ifdef CONFIG_PNP
- static const char * const const ids[] __initconst =
+ static const char * const ids[] __initconst =
{ "PNP0b00", "PNP0b01", "PNP0b02", };
struct pnp_dev *dev;
struct pnp_id *id;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ab4734e5411d..98dc9317286e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -89,6 +89,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/bugs.h>
+#include <asm/kasan.h>
#include <asm/vsyscall.h>
#include <asm/cpu.h>
@@ -121,6 +122,8 @@
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
+bool __read_mostly kaslr_enabled = false;
+
#ifdef CONFIG_DMI
RESERVE_BRK(dmi_alloc, 65536);
#endif
@@ -424,6 +427,11 @@ static void __init reserve_initrd(void)
}
#endif /* CONFIG_BLK_DEV_INITRD */
+static void __init parse_kaslr_setup(u64 pa_data, u32 data_len)
+{
+ kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data));
+}
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
@@ -431,15 +439,13 @@ static void __init parse_setup_data(void)
pa_data = boot_params.hdr.setup_data;
while (pa_data) {
- u32 data_len, map_len, data_type;
+ u32 data_len, data_type;
- map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
- (u64)sizeof(struct setup_data));
- data = early_memremap(pa_data, map_len);
+ data = early_memremap(pa_data, sizeof(*data));
data_len = data->len + sizeof(struct setup_data);
data_type = data->type;
pa_next = data->next;
- early_iounmap(data, map_len);
+ early_iounmap(data, sizeof(*data));
switch (data_type) {
case SETUP_E820_EXT:
@@ -451,6 +457,9 @@ static void __init parse_setup_data(void)
case SETUP_EFI:
parse_efi_setup(pa_data, data_len);
break;
+ case SETUP_KASLR:
+ parse_kaslr_setup(pa_data, data_len);
+ break;
default:
break;
}
@@ -833,10 +842,14 @@ static void __init trim_low_memory_range(void)
static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
- pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
- "(relocation range: 0x%lx-0x%lx)\n",
- (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
- __START_KERNEL_map, MODULES_VADDR-1);
+ if (kaslr_enabled)
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ (unsigned long)&_text - __START_KERNEL,
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ else
+ pr_emerg("Kernel Offset: disabled\n");
return 0;
}
@@ -1176,9 +1189,11 @@ void __init setup_arch(char **cmdline_p)
x86_init.paging.pagetable_init();
+ kasan_init();
+
if (boot_cpu_data.cpuid_level >= 0) {
/* A CPU has %cr4 if and only if it has CPUID */
- mmu_cr4_features = read_cr4();
+ mmu_cr4_features = __read_cr4();
if (trampoline_cr4_features)
*trampoline_cr4_features = mmu_cr4_features;
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index ed37a768d0fc..e5042463c1bc 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
unsigned int err = 0;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
get_user_try {
@@ -740,12 +740,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
user_exit();
-#ifdef CONFIG_X86_MCE
- /* notify userspace of pending MCEs */
- if (thread_info_flags & _TIF_MCE_NOTIFY)
- mce_notify_process();
-#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
-
if (thread_info_flags & _TIF_UPROBE)
uprobe_notify_resume(regs);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6d7022c683e3..febc6aabc72e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
-#include <asm/smpboot_hooks.h>
#include <asm/i8259.h>
#include <asm/realmode.h>
#include <asm/misc.h>
@@ -104,6 +103,43 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
atomic_t init_deasserted;
+static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0xa, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+ local_flush_tlb();
+ pr_debug("1.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+ start_eip >> 4;
+ pr_debug("2.\n");
+ *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+ start_eip & 0xf;
+ pr_debug("3.\n");
+}
+
+static inline void smpboot_restore_warm_reset_vector(void)
+{
+ unsigned long flags;
+
+ /*
+ * Install writable page 0 entry to set BIOS data area.
+ */
+ local_flush_tlb();
+
+ /*
+ * Paranoid: Set warm reset code and vector here back
+ * to default values.
+ */
+ spin_lock_irqsave(&rtc_lock, flags);
+ CMOS_WRITE(0, 0xf);
+ spin_unlock_irqrestore(&rtc_lock, flags);
+
+ *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
+}
+
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -136,8 +172,7 @@ static void smp_callin(void)
* CPU, first the APIC. (this is probably redundant on most
* boards)
*/
- setup_local_APIC();
- end_local_APIC_setup();
+ apic_ap_setup();
/*
* Need to setup vector mappings before we enable interrupts.
@@ -955,9 +990,12 @@ void arch_disable_smp_support(void)
*/
static __init void disable_smp(void)
{
+ pr_info("SMP disabled\n");
+
+ disable_ioapic_support();
+
init_cpu_present(cpumask_of(0));
init_cpu_possible(cpumask_of(0));
- smpboot_clear_io_apic_irqs();
if (smp_found_config)
physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
@@ -967,6 +1005,13 @@ static __init void disable_smp(void)
cpumask_set_cpu(0, cpu_core_mask(0));
}
+enum {
+ SMP_OK,
+ SMP_NO_CONFIG,
+ SMP_NO_APIC,
+ SMP_FORCE_UP,
+};
+
/*
* Various sanity checks.
*/
@@ -1014,10 +1059,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
if (!smp_found_config && !acpi_lapic) {
preempt_enable();
pr_notice("SMP motherboard not detected\n");
- disable_smp();
- if (APIC_init_uniprocessor())
- pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
- return -1;
+ return SMP_NO_CONFIG;
}
/*
@@ -1041,9 +1083,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
boot_cpu_physical_apicid);
pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
}
- smpboot_clear_io_apic();
- disable_ioapic_support();
- return -1;
+ return SMP_NO_APIC;
}
verify_local_APIC();
@@ -1053,15 +1093,10 @@ static int __init smp_sanity_check(unsigned max_cpus)
*/
if (!max_cpus) {
pr_info("SMP mode deactivated\n");
- smpboot_clear_io_apic();
-
- connect_bsp_APIC();
- setup_local_APIC();
- bsp_end_local_APIC_setup();
- return -1;
+ return SMP_FORCE_UP;
}
- return 0;
+ return SMP_OK;
}
static void __init smp_cpu_index_default(void)
@@ -1101,10 +1136,21 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
- if (smp_sanity_check(max_cpus) < 0) {
- pr_info("SMP disabled\n");
+ switch (smp_sanity_check(max_cpus)) {
+ case SMP_NO_CONFIG:
disable_smp();
+ if (APIC_init_uniprocessor())
+ pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
return;
+ case SMP_NO_APIC:
+ disable_smp();
+ return;
+ case SMP_FORCE_UP:
+ disable_smp();
+ apic_bsp_setup(false);
+ return;
+ case SMP_OK:
+ break;
}
default_setup_apic_routing();
@@ -1115,33 +1161,10 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
/* Or can we switch back to PIC here? */
}
- connect_bsp_APIC();
-
- /*
- * Switch from PIC to APIC mode.
- */
- setup_local_APIC();
-
- if (x2apic_mode)
- cpu0_logical_apicid = apic_read(APIC_LDR);
- else
- cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
-
- /*
- * Enable IO APIC before setting up error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
-
- bsp_end_local_APIC_setup();
- smpboot_setup_io_apic();
- /*
- * Set up local APIC timer on boot CPU.
- */
+ cpu0_logical_apicid = apic_bsp_setup(false);
pr_info("CPU%d: ", 0);
print_cpu_info(&cpu_data(0));
- x86_init.timers.setup_percpu_clockev();
if (is_uv_system())
uv_system_init();
@@ -1177,9 +1200,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
nmi_selftest();
impress_friends();
-#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
-#endif
mtrr_aps_init();
}
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 4e942f31b1a7..7fc5e843f247 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -29,7 +29,28 @@ static int get_free_idx(void)
static bool tls_desc_okay(const struct user_desc *info)
{
- if (LDT_empty(info))
+ /*
+ * For historical reasons (i.e. no one ever documented how any
+ * of the segmentation APIs work), user programs can and do
+ * assume that a struct user_desc that's all zeros except for
+ * entry_number means "no segment at all". This never actually
+ * worked. In fact, up to Linux 3.19, a struct user_desc like
+ * this would create a 16-bit read-write segment with base and
+ * limit both equal to zero.
+ *
+ * That was close enough to "no segment at all" until we
+ * hardened this function to disallow 16-bit TLS segments. Fix
+ * it up by interpreting these zeroed segments the way that they
+ * were almost certainly intended to be interpreted.
+ *
+ * The correct way to ask for "no segment at all" is to specify
+ * a user_desc that satisfies LDT_empty. To keep everything
+ * working, we accept both.
+ *
+ * Note that there's a similar kludge in modify_ldt -- look at
+ * the distinction between modes 1 and 0x11.
+ */
+ if (LDT_empty(info) || LDT_zero(info))
return true;
/*
@@ -71,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
- if (LDT_empty(info))
+ if (LDT_empty(info) || LDT_zero(info))
desc->a = desc->b = 0;
else
fill_ldt(desc, info);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..9d2073e2ecc9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,6 +108,88 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
preempt_count_dec();
}
+enum ctx_state ist_enter(struct pt_regs *regs)
+{
+ enum ctx_state prev_state;
+
+ if (user_mode_vm(regs)) {
+ /* Other than that, we're just an exception. */
+ prev_state = exception_enter();
+ } else {
+ /*
+ * We might have interrupted pretty much anything. In
+ * fact, if we're a machine check, we can even interrupt
+ * NMI processing. We don't want in_nmi() to return true,
+ * but we need to notify RCU.
+ */
+ rcu_nmi_enter();
+ prev_state = IN_KERNEL; /* the value is irrelevant. */
+ }
+
+ /*
+ * We are atomic because we're on the IST stack (or we're on x86_32,
+ * in which case we still shouldn't schedule).
+ *
+ * This must be after exception_enter(), because exception_enter()
+ * won't do anything if in_interrupt() returns true.
+ */
+ preempt_count_add(HARDIRQ_OFFSET);
+
+ /* This code is a bit fragile. Test it. */
+ rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
+
+ return prev_state;
+}
+
+void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+{
+ /* Must be before exception_exit. */
+ preempt_count_sub(HARDIRQ_OFFSET);
+
+ if (user_mode_vm(regs))
+ return exception_exit(prev_state);
+ else
+ rcu_nmi_exit();
+}
+
+/**
+ * ist_begin_non_atomic() - begin a non-atomic section in an IST exception
+ * @regs: regs passed to the IST exception handler
+ *
+ * IST exception handlers normally cannot schedule. As a special
+ * exception, if the exception interrupted userspace code (i.e.
+ * user_mode_vm(regs) would return true) and the exception was not
+ * a double fault, it can be safe to schedule. ist_begin_non_atomic()
+ * begins a non-atomic section within an ist_enter()/ist_exit() region.
+ * Callers are responsible for enabling interrupts themselves inside
+ * the non-atomic section, and callers must call is_end_non_atomic()
+ * before ist_exit().
+ */
+void ist_begin_non_atomic(struct pt_regs *regs)
+{
+ BUG_ON(!user_mode_vm(regs));
+
+ /*
+ * Sanity check: we need to be on the normal thread stack. This
+ * will catch asm bugs and any attempt to use ist_preempt_enable
+ * from double_fault.
+ */
+ BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
+ & ~(THREAD_SIZE - 1)) != 0);
+
+ preempt_count_sub(HARDIRQ_OFFSET);
+}
+
+/**
+ * ist_end_non_atomic() - begin a non-atomic section in an IST exception
+ *
+ * Ends a non-atomic section started with ist_begin_non_atomic().
+ */
+void ist_end_non_atomic(void)
+{
+ preempt_count_add(HARDIRQ_OFFSET);
+}
+
static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
@@ -251,6 +333,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
* end up promoting it to a doublefault. In that case, modify
* the stack to make it look like we just entered the #GP
* handler from user space, similar to bad_iret.
+ *
+ * No need for ist_enter here because we don't use RCU.
*/
if (((long)regs->sp >> PGDIR_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
@@ -263,12 +347,12 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
regs->ip = (unsigned long)general_protection;
regs->sp = (unsigned long)&normal_regs->orig_ax;
+
return;
}
#endif
- exception_enter();
- /* Return not checked because double check cannot be ignored */
+ ist_enter(regs); /* Discard prev_state because we won't return. */
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -434,7 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
if (poke_int3_handler(regs))
return;
- prev_state = exception_enter();
+ prev_state = ist_enter(regs);
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
@@ -460,33 +544,20 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
preempt_conditional_cli(regs);
debug_stack_usage_dec();
exit:
- exception_exit(prev_state);
+ ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
- * Help handler running on IST stack to switch back to user stack
- * for scheduling or signal handling. The actual stack switch is done in
- * entry.S
+ * Help handler running on IST stack to switch off the IST stack if the
+ * interrupted code was in user mode. The actual stack switch is done in
+ * entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
{
- struct pt_regs *regs = eregs;
- /* Did already sync */
- if (eregs == (struct pt_regs *)eregs->sp)
- ;
- /* Exception from user space */
- else if (user_mode(eregs))
- regs = task_pt_regs(current);
- /*
- * Exception from kernel and interrupts are enabled. Move to
- * kernel process stack.
- */
- else if (eregs->flags & X86_EFLAGS_IF)
- regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
- if (eregs != regs)
- *regs = *eregs;
+ struct pt_regs *regs = task_pt_regs(current);
+ *regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
@@ -554,7 +625,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
unsigned long dr6;
int si_code;
- prev_state = exception_enter();
+ prev_state = ist_enter(regs);
get_debugreg(dr6, 6);
@@ -629,7 +700,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
debug_stack_usage_dec();
exit:
- exception_exit(prev_state);
+ ist_exit(regs, prev_state);
}
NOKPROBE_SYMBOL(do_debug);
@@ -788,18 +859,16 @@ void math_state_restore(void)
local_irq_disable();
}
+ /* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
+ kernel_fpu_disable();
__thread_fpu_begin(tsk);
-
- /*
- * Paranoid restore. send a SIGSEGV if we fail to restore the state.
- */
if (unlikely(restore_fpu_checking(tsk))) {
drop_init_fpu(tsk);
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
- return;
+ } else {
+ tsk->thread.fpu_counter++;
}
-
- tsk->thread.fpu_counter++;
+ kernel_fpu_enable();
}
EXPORT_SYMBOL_GPL(math_state_restore);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b7e50bba3bbb..505449700e0c 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -617,7 +617,7 @@ static unsigned long quick_pit_calibrate(void)
goto success;
}
}
- pr_err("Fast TSC calibration failed\n");
+ pr_info("Fast TSC calibration failed\n");
return 0;
success:
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 8b96a947021f..81f8adb0679e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -66,27 +66,54 @@
* Good-instruction tables for 32-bit apps. This is non-const and volatile
* to keep gcc from statically optimizing it out, as variable_test_bit makes
* some versions of gcc to think only *(unsigned long*) is used.
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * ce - into. Not used in userspace - no kernel support to make it useful. SEGVs
+ * (why we support bound (62) then? it's similar, and similarly unused...)
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * 07,17,1f - pop es/ss/ds
+ * Normally not used in userspace, but would execute if used.
+ * Can cause GP or stack exception if tries to load wrong segment descriptor.
+ * We hesitate to run them under single step since kernel's handling
+ * of userspace single-stepping (TF flag) is fragile.
+ * We can easily refuse to support push es/cs/ss/ds (06/0e/16/1e)
+ * on the same grounds that they are never used.
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
*/
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static volatile u32 good_insns_32[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
+ W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 00 */
W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
- W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
- W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
- W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
@@ -94,27 +121,61 @@ static volatile u32 good_insns_32[256 / 32] = {
#define good_insns_32 NULL
#endif
-/* Good-instruction tables for 64-bit apps */
+/* Good-instruction tables for 64-bit apps.
+ *
+ * Genuinely invalid opcodes:
+ * 06,07 - formerly push/pop es
+ * 0e - formerly push cs
+ * 16,17 - formerly push/pop ss
+ * 1e,1f - formerly push/pop ds
+ * 27,2f,37,3f - formerly daa/das/aaa/aas
+ * 60,61 - formerly pusha/popa
+ * 62 - formerly bound. EVEX prefix for AVX512 (not yet supported)
+ * 82 - formerly redundant encoding of Group1
+ * 9a - formerly call seg:ofs
+ * ce - formerly into
+ * d4,d5 - formerly aam/aad
+ * d6 - formerly undocumented salc
+ * ea - formerly jmp seg:ofs
+ *
+ * Opcodes we'll probably never support:
+ * 6c-6f - ins,outs. SEGVs if used in userspace
+ * e4-e7 - in,out imm. SEGVs if used in userspace
+ * ec-ef - in,out acc. SEGVs if used in userspace
+ * cc - int3. SIGTRAP if used in userspace
+ * f1 - int1. SIGTRAP if used in userspace
+ * f4 - hlt. SEGVs if used in userspace
+ * fa - cli. SEGVs if used in userspace
+ * fb - sti. SEGVs if used in userspace
+ *
+ * Opcodes which need some work to be supported:
+ * cd - int N.
+ * Used by userspace for "int 80" syscall entry. (Other "int N"
+ * cause GP -> SEGV since their IDT gates don't allow calls from CPL 3).
+ * Not supported since kernel's handling of userspace single-stepping
+ * (TF flag) is fragile.
+ * cf - iret. Normally not used in userspace. Doesn't SEGV unless arguments are bad
+ */
#if defined(CONFIG_X86_64)
static volatile u32 good_insns_64[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
- W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
+ W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* 00 */
W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
- W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
- W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
- W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 20 */
+ W(0x30, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 30 */
+ W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
- W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
+ W(0x60, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
- W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
+ W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1) , /* 90 */
W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
- W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
+ W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
- W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
- W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
+ W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0) | /* e0 */
+ W(0xf0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
@@ -122,49 +183,55 @@ static volatile u32 good_insns_64[256 / 32] = {
#define good_insns_64 NULL
#endif
-/* Using this for both 64-bit and 32-bit apps */
+/* Using this for both 64-bit and 32-bit apps.
+ * Opcodes we don't support:
+ * 0f 00 - SLDT/STR/LLDT/LTR/VERR/VERW/-/- group. System insns
+ * 0f 01 - SGDT/SIDT/LGDT/LIDT/SMSW/-/LMSW/INVLPG group.
+ * Also encodes tons of other system insns if mod=11.
+ * Some are in fact non-system: xend, xtest, rdtscp, maybe more
+ * 0f 05 - syscall
+ * 0f 06 - clts (CPL0 insn)
+ * 0f 07 - sysret
+ * 0f 08 - invd (CPL0 insn)
+ * 0f 09 - wbinvd (CPL0 insn)
+ * 0f 0b - ud2
+ * 0f 30 - wrmsr (CPL0 insn) (then why rdmsr is allowed, it's also CPL0 insn?)
+ * 0f 34 - sysenter
+ * 0f 35 - sysexit
+ * 0f 37 - getsec
+ * 0f 78 - vmread (Intel VMX. CPL0 insn)
+ * 0f 79 - vmwrite (Intel VMX. CPL0 insn)
+ * Note: with prefixes, these two opcodes are
+ * extrq/insertq/AVX512 convert vector ops.
+ * 0f ae - group15: [f]xsave,[f]xrstor,[v]{ld,st}mxcsr,clflush[opt],
+ * {rd,wr}{fs,gs}base,{s,l,m}fence.
+ * Why? They are all user-executable.
+ */
static volatile u32 good_2byte_insns[256 / 32] = {
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* ---------------------------------------------- */
- W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
- W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
- W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
- W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
+ W(0x00, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1) | /* 00 */
+ W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
+ W(0x20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
+ W(0x30, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* 30 */
W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
- W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
+ W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* 70 */
W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
- W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
- W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
+ W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
+ W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
- W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
+ W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
- W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
+ W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) /* f0 */
/* ---------------------------------------------- */
/* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
};
#undef W
/*
- * opcodes we'll probably never support:
- *
- * 6c-6d, e4-e5, ec-ed - in
- * 6e-6f, e6-e7, ee-ef - out
- * cc, cd - int3, int
- * cf - iret
- * d6 - illegal instruction
- * f1 - int1/icebp
- * f4 - hlt
- * fa, fb - cli, sti
- * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
- *
- * invalid opcodes in 64-bit mode:
- *
- * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
- * 63 - we support this opcode in x86_64 but not in i386.
- *
* opcodes we may need to refine support for:
*
* 0f - 2-byte instructions: For many of these instructions, the validity
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 040681928e9d..37d8fa4438f0 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
#undef memset
#undef memmove
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
extern void *memset(void *, int, __kernel_size_t);
extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(memmove);
#ifndef CONFIG_DEBUG_VIRTUAL
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0de1fae2bdf0..34f66e58a896 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -12,6 +12,7 @@
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/sigframe.h>
+#include <asm/tlbflush.h>
#include <asm/xcr.h>
/*
@@ -453,7 +454,7 @@ static void prepare_fx_sw_frame(void)
*/
static inline void xstate_enable(void)
{
- set_in_cr4(X86_CR4_OSXSAVE);
+ cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index f9d16ff56c6b..413a7bf9efbb 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -39,7 +39,9 @@ config KVM
select PERF_EVENTS
select HAVE_KVM_MSI
select HAVE_KVM_CPU_RELAX_INTERCEPT
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_VFIO
+ select SRCU
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 169b09d76ddd..e0b794a84c35 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -86,6 +86,7 @@
#define DstAcc (OpAcc << DstShift)
#define DstDI (OpDI << DstShift)
#define DstMem64 (OpMem64 << DstShift)
+#define DstMem16 (OpMem16 << DstShift)
#define DstImmUByte (OpImmUByte << DstShift)
#define DstDX (OpDX << DstShift)
#define DstAccLo (OpAccLo << DstShift)
@@ -124,6 +125,7 @@
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
#define Escape (5<<15) /* Escape to coprocessor instruction */
#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
+#define ModeDual (7<<15) /* Different instruction for 32/64 bit */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -165,10 +167,10 @@
#define NoMod ((u64)1 << 47) /* Mod field is ignored */
#define Intercept ((u64)1 << 48) /* Has valid intercept field */
#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
-#define NoBigReal ((u64)1 << 50) /* No big real mode */
#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
+#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -213,6 +215,7 @@ struct opcode {
const struct gprefix *gprefix;
const struct escape *esc;
const struct instr_dual *idual;
+ const struct mode_dual *mdual;
void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -240,6 +243,11 @@ struct instr_dual {
struct opcode mod3;
};
+struct mode_dual {
+ struct opcode mode32;
+ struct opcode mode64;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -262,6 +270,13 @@ struct instr_dual {
#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
#define EFLG_RESERVED_ONE_MASK 2
+enum x86_transfer_type {
+ X86_TRANSFER_NONE,
+ X86_TRANSFER_CALL_JMP,
+ X86_TRANSFER_RET,
+ X86_TRANSFER_TASK_SWITCH,
+};
+
static ulong reg_read(struct x86_emulate_ctxt *ctxt, unsigned nr)
{
if (!(ctxt->regs_valid & (1 << nr))) {
@@ -669,9 +684,13 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
if (addr.ea > lim)
goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
- if (size > *max_size)
- goto bad;
+ if (lim == 0xffffffff)
+ *max_size = ~0u;
+ else {
+ *max_size = (u64)lim + 1 - addr.ea;
+ if (size > *max_size)
+ goto bad;
+ }
la &= (u32)-1;
break;
}
@@ -722,19 +741,26 @@ static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
const struct desc_struct *cs_desc)
{
enum x86emul_mode mode = ctxt->mode;
+ int rc;
#ifdef CONFIG_X86_64
- if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
- u64 efer = 0;
+ if (ctxt->mode >= X86EMUL_MODE_PROT16) {
+ if (cs_desc->l) {
+ u64 efer = 0;
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- mode = X86EMUL_MODE_PROT64;
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ } else
+ mode = X86EMUL_MODE_PROT32; /* temporary value */
}
#endif
if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- return assign_eip(ctxt, dst, mode);
+ rc = assign_eip(ctxt, dst, mode);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->mode = mode;
+ return rc;
}
static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1057,8 +1083,6 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstcw %0": "+m"(fcw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fcw;
return X86EMUL_CONTINUE;
@@ -1075,8 +1099,6 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
asm volatile("fnstsw %0": "+m"(fsw));
ctxt->ops->put_fpu(ctxt);
- /* force 2 byte destination */
- ctxt->dst.bytes = 2;
ctxt->dst.val = fsw;
return X86EMUL_CONTINUE;
@@ -1223,6 +1245,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
else {
modrm_ea += reg_read(ctxt, base_reg);
adjust_modrm_seg(ctxt, base_reg);
+ /* Increment ESP on POP [ESP] */
+ if ((ctxt->d & IncSP) &&
+ base_reg == VCPU_REGS_RSP)
+ modrm_ea += ctxt->op_bytes;
}
if (index_reg != 4)
modrm_ea += reg_read(ctxt, index_reg) << scale;
@@ -1435,10 +1461,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
ops->get_gdt(ctxt, dt);
}
-/* allowed just for 8 bytes segments */
-static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
- u16 selector, struct desc_struct *desc,
- ulong *desc_addr_p)
+static int get_descriptor_ptr(struct x86_emulate_ctxt *ctxt,
+ u16 selector, ulong *desc_addr_p)
{
struct desc_ptr dt;
u16 index = selector >> 3;
@@ -1449,8 +1473,34 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (dt.size < index * 8 + 7)
return emulate_gp(ctxt, selector & 0xfffc);
- *desc_addr_p = addr = dt.address + index * 8;
- return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+ addr = dt.address + index * 8;
+
+#ifdef CONFIG_X86_64
+ if (addr >> 32 != 0) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (!(efer & EFER_LMA))
+ addr &= (u32)-1;
+ }
+#endif
+
+ *desc_addr_p = addr;
+ return X86EMUL_CONTINUE;
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+ u16 selector, struct desc_struct *desc,
+ ulong *desc_addr_p)
+{
+ int rc;
+
+ rc = get_descriptor_ptr(ctxt, selector, desc_addr_p);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return ctxt->ops->read_std(ctxt, *desc_addr_p, desc, sizeof(*desc),
&ctxt->exception);
}
@@ -1458,16 +1508,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, struct desc_struct *desc)
{
- struct desc_ptr dt;
- u16 index = selector >> 3;
+ int rc;
ulong addr;
- get_descriptor_table_ptr(ctxt, selector, &dt);
-
- if (dt.size < index * 8 + 7)
- return emulate_gp(ctxt, selector & 0xfffc);
+ rc = get_descriptor_ptr(ctxt, selector, &addr);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
- addr = dt.address + index * 8;
return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
&ctxt->exception);
}
@@ -1475,7 +1522,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/* Does not support long mode */
static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg, u8 cpl,
- bool in_task_switch,
+ enum x86_transfer_type transfer,
struct desc_struct *desc)
{
struct desc_struct seg_desc, old_desc;
@@ -1529,11 +1576,15 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret;
err_code = selector & 0xfffc;
- err_vec = in_task_switch ? TS_VECTOR : GP_VECTOR;
+ err_vec = (transfer == X86_TRANSFER_TASK_SWITCH) ? TS_VECTOR :
+ GP_VECTOR;
/* can't load system descriptor into segment selector */
- if (seg <= VCPU_SREG_GS && !seg_desc.s)
+ if (seg <= VCPU_SREG_GS && !seg_desc.s) {
+ if (transfer == X86_TRANSFER_CALL_JMP)
+ return X86EMUL_UNHANDLEABLE;
goto exception;
+ }
if (!seg_desc.p) {
err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
@@ -1605,10 +1656,13 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
if (seg_desc.s) {
/* mark segment as accessed */
- seg_desc.type |= 1;
- ret = write_segment_descriptor(ctxt, selector, &seg_desc);
- if (ret != X86EMUL_CONTINUE)
- return ret;
+ if (!(seg_desc.type & 1)) {
+ seg_desc.type |= 1;
+ ret = write_segment_descriptor(ctxt, selector,
+ &seg_desc);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
+ }
} else if (ctxt->mode == X86EMUL_MODE_PROT64) {
ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3,
sizeof(base3), &ctxt->exception);
@@ -1631,7 +1685,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
u16 selector, int seg)
{
u8 cpl = ctxt->ops->cpl(ctxt);
- return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL);
+ return __load_segment_descriptor(ctxt, selector, seg, cpl,
+ X86_TRANSFER_NONE, NULL);
}
static void write_register_operand(struct operand *op)
@@ -1828,12 +1883,14 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
unsigned long selector;
int rc;
- rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
+ rc = emulate_pop(ctxt, &selector, 2);
if (rc != X86EMUL_CONTINUE)
return rc;
if (ctxt->modrm_reg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+ if (ctxt->op_bytes > 2)
+ rsp_increment(ctxt, ctxt->op_bytes - 2);
rc = load_segment_descriptor(ctxt, (u16)selector, seg);
return rc;
@@ -2007,6 +2064,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+ ctxt->ops->set_nmi_mask(ctxt, false);
return rc;
}
@@ -2041,7 +2099,8 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2130,7 +2189,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_RET,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -2163,12 +2223,15 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
fastop(ctxt, em_cmp);
if (ctxt->eflags & EFLG_ZF) {
- /* Success: write back to memory. */
+ /* Success: write back to memory; no update of EAX */
+ ctxt->src.type = OP_NONE;
ctxt->dst.val = ctxt->src.orig_val;
} else {
/* Failure: write the value we saw to EAX. */
- ctxt->dst.type = OP_REG;
- ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.type = OP_REG;
+ ctxt->src.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX);
+ ctxt->src.val = ctxt->dst.orig_val;
+ /* Create write-cycle to dest by writing the same value */
ctxt->dst.val = ctxt->dst.orig_val;
}
return X86EMUL_CONTINUE;
@@ -2348,7 +2411,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
* Not recognized on AMD in compat mode (but is recognized in legacy
* mode).
*/
- if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA)
+ if ((ctxt->mode != X86EMUL_MODE_PROT64) && (efer & EFER_LMA)
&& !vendor_intel(ctxt))
return emulate_ud(ctxt);
@@ -2359,25 +2422,13 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
setup_syscalls_segments(ctxt, &cs, &ss);
ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
- switch (ctxt->mode) {
- case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0)
- return emulate_gp(ctxt, 0);
- break;
- case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0)
- return emulate_gp(ctxt, 0);
- break;
- default:
- break;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
- cs_sel = (u16)msr_data;
- cs_sel &= ~SELECTOR_RPL_MASK;
+ cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
ss_sel = cs_sel + 8;
- ss_sel &= ~SELECTOR_RPL_MASK;
- if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
+ if (efer & EFER_LMA) {
cs.d = 0;
cs.l = 1;
}
@@ -2386,10 +2437,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
- ctxt->_eip = msr_data;
+ ctxt->_eip = (efer & EFER_LMA) ? msr_data : (u32)msr_data;
ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
- *reg_write(ctxt, VCPU_REGS_RSP) = msr_data;
+ *reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
+ (u32)msr_data;
return X86EMUL_CONTINUE;
}
@@ -2567,23 +2619,23 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2705,31 +2757,31 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
- cpl, true, NULL);
+ cpl, X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
- true, NULL);
+ X86_TRANSFER_TASK_SWITCH, NULL);
if (ret != X86EMUL_CONTINUE)
return ret;
@@ -2750,7 +2802,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
save_state_to_tss32(ctxt, &tss_seg);
@@ -2759,13 +2810,11 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(ctxt, old_tss_base + eip_offset, &tss_seg.eip,
ldt_sel_offset - eip_offset, &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
if (old_tss_sel != 0xffff) {
@@ -2776,7 +2825,6 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
sizeof tss_seg.prev_task_link,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
}
@@ -3010,15 +3058,16 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
struct desc_struct old_desc, new_desc;
const struct x86_emulate_ops *ops = ctxt->ops;
int cpl = ctxt->ops->cpl(ctxt);
+ enum x86emul_mode prev_mode = ctxt->mode;
old_eip = ctxt->_eip;
ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS);
memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
- rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false,
- &new_desc);
+ rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
+ X86_TRANSFER_CALL_JMP, &new_desc);
if (rc != X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
+ return rc;
rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE)
@@ -3033,11 +3082,14 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
rc = em_push(ctxt);
/* If we failed, we tainted the memory, but the very least we should
restore cs */
- if (rc != X86EMUL_CONTINUE)
+ if (rc != X86EMUL_CONTINUE) {
+ pr_warn_once("faulting far call emulation tainted memory\n");
goto fail;
+ }
return rc;
fail:
ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
+ ctxt->mode = prev_mode;
return rc;
}
@@ -3488,6 +3540,12 @@ static int em_clflush(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int em_movsxd(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = (s32) ctxt->src.val;
+ return X86EMUL_CONTINUE;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -3687,6 +3745,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
+#define MD(_f, _m) { .flags = ((_f) | ModeDual), .u.mdual = (_m) }
#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3749,7 +3808,7 @@ static const struct opcode group1[] = {
};
static const struct opcode group1A[] = {
- I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
};
static const struct opcode group2[] = {
@@ -3791,8 +3850,8 @@ static const struct opcode group5[] = {
};
static const struct opcode group6[] = {
- DI(Prot, sldt),
- DI(Prot, str),
+ DI(Prot | DstMem, sldt),
+ DI(Prot | DstMem, str),
II(Prot | Priv | SrcMem16, em_lldt, lldt),
II(Prot | Priv | SrcMem16, em_ltr, ltr),
N, N, N, N,
@@ -3865,7 +3924,7 @@ static const struct gprefix pfx_0f_e7 = {
};
static const struct escape escape_d9 = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstcw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstcw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -3907,7 +3966,7 @@ static const struct escape escape_db = { {
} };
static const struct escape escape_dd = { {
- N, N, N, N, N, N, N, I(DstMem, em_fnstsw),
+ N, N, N, N, N, N, N, I(DstMem16 | Mov, em_fnstsw),
}, {
/* 0xC0 - 0xC7 */
N, N, N, N, N, N, N, N,
@@ -3931,6 +3990,10 @@ static const struct instr_dual instr_dual_0f_c3 = {
I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
};
+static const struct mode_dual mode_dual_63 = {
+ N, I(DstReg | SrcMem32 | ModRM | Mov, em_movsxd)
+};
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
F6ALU(Lock, em_add),
@@ -3965,7 +4028,7 @@ static const struct opcode opcode_table[256] = {
/* 0x60 - 0x67 */
I(ImplicitOps | Stack | No64, em_pusha),
I(ImplicitOps | Stack | No64, em_popa),
- N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
+ N, MD(ModRM, &mode_dual_63),
N, N, N, N,
/* 0x68 - 0x6F */
I(SrcImm | Mov | Stack, em_push),
@@ -4021,8 +4084,8 @@ static const struct opcode opcode_table[256] = {
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_far_imm),
- I(ImplicitOps | Stack, em_ret_far),
+ I(ImplicitOps | SrcImmU16, em_ret_far_imm),
+ I(ImplicitOps, em_ret_far),
D(ImplicitOps), DI(SrcImmByte, intn),
D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
/* 0xD0 - 0xD7 */
@@ -4119,7 +4182,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
- I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg),
+ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable | SrcWrite, em_cmpxchg),
I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr),
I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
@@ -4185,6 +4248,8 @@ static const struct opcode opcode_map_0f_38[256] = {
#undef I
#undef GP
#undef EXT
+#undef MD
+#undef ID
#undef D2bv
#undef D2bvIP
@@ -4574,6 +4639,12 @@ done_prefixes:
else
opcode = opcode.u.idual->mod012;
break;
+ case ModeDual:
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ opcode = opcode.u.mdual->mode64;
+ else
+ opcode = opcode.u.mdual->mode32;
+ break;
default:
return EMULATION_FAILED;
}
@@ -4871,8 +4942,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
/* optimisation - avoid slow emulated read if Mov */
rc = segmented_read(ctxt, ctxt->dst.addr.mem,
&ctxt->dst.val, ctxt->dst.bytes);
- if (rc != X86EMUL_CONTINUE)
+ if (rc != X86EMUL_CONTINUE) {
+ if (!(ctxt->d & NoWrite) &&
+ rc == X86EMUL_PROPAGATE_FAULT &&
+ ctxt->exception.vector == PF_VECTOR)
+ ctxt->exception.error_code |= PFERR_WRITE_MASK;
goto done;
+ }
}
ctxt->dst.orig_val = ctxt->dst.val;
@@ -4910,11 +4986,6 @@ special_insn:
goto threebyte_insn;
switch (ctxt->b) {
- case 0x63: /* movsxd */
- if (ctxt->mode != X86EMUL_MODE_PROT64)
- goto cannot_emulate;
- ctxt->dst.val = (s32) ctxt->src.val;
- break;
case 0x70 ... 0x7f: /* jcc (short) */
if (test_cc(ctxt->b, ctxt->eflags))
rc = jmp_rel(ctxt, ctxt->src.val);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 3c9195535ffc..c2e36d934af4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,7 +98,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 17b73eeac8a4..7dbced309ddb 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -138,7 +138,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
gfn += page_size >> PAGE_SHIFT;
-
+ cond_resched();
}
return 0;
@@ -306,6 +306,8 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
kvm_unpin_pages(kvm, pfn, unmap_pages);
gfn += unmap_pages;
+
+ cond_resched();
}
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4f0c0b954686..e55b5fc344eb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/current.h>
#include <asm/apicdef.h>
+#include <asm/delay.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include "kvm_cache_regs.h"
@@ -192,6 +193,9 @@ static void recalculate_apic_map(struct kvm *kvm)
u16 cid, lid;
u32 ldr, aid;
+ if (!kvm_apic_present(vcpu))
+ continue;
+
aid = kvm_apic_id(apic);
ldr = kvm_apic_get_reg(apic, APIC_LDR);
cid = apic_cluster_id(new, ldr);
@@ -324,17 +328,24 @@ static u8 count_vectors(void *bitmap)
return count;
}
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+void __kvm_apic_update_irr(u32 *pir, void *regs)
{
u32 i, pir_val;
- struct kvm_lapic *apic = vcpu->arch.apic;
for (i = 0; i <= 7; i++) {
pir_val = xchg(&pir[i], 0);
if (pir_val)
- *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
}
}
+EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
+
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ __kvm_apic_update_irr(pir, apic->regs);
+}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
@@ -402,7 +413,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
else {
++apic->isr_count;
@@ -450,7 +461,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
else {
@@ -577,55 +588,48 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
apic_update_ppr(apic);
}
-static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
{
return dest == (apic_x2apic_mode(apic) ?
X2APIC_BROADCAST : APIC_BROADCAST);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
{
return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
+static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
{
- int result = 0;
u32 logical_id;
if (kvm_apic_broadcast(apic, mda))
- return 1;
+ return true;
- if (apic_x2apic_mode(apic)) {
- logical_id = kvm_apic_get_reg(apic, APIC_LDR);
- return logical_id & mda;
- }
+ logical_id = kvm_apic_get_reg(apic, APIC_LDR);
+
+ if (apic_x2apic_mode(apic))
+ return ((logical_id >> 16) == (mda >> 16))
+ && (logical_id & mda & 0xffff) != 0;
- logical_id = GET_APIC_LOGICAL_ID(kvm_apic_get_reg(apic, APIC_LDR));
+ logical_id = GET_APIC_LOGICAL_ID(logical_id);
switch (kvm_apic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
- if (logical_id & mda)
- result = 1;
- break;
+ return (logical_id & mda) != 0;
case APIC_DFR_CLUSTER:
- if (((logical_id >> 4) == (mda >> 0x4))
- && (logical_id & mda & 0xf))
- result = 1;
- break;
+ return ((logical_id >> 4) == (mda >> 4))
+ && (logical_id & mda & 0xf) != 0;
default:
apic_debug("Bad DFR vcpu %d: %08x\n",
apic->vcpu->vcpu_id, kvm_apic_get_reg(apic, APIC_DFR));
- break;
+ return false;
}
-
- return result;
}
-int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode)
{
- int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
apic_debug("target %p, source %p, dest 0x%x, "
@@ -635,29 +639,21 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
ASSERT(target);
switch (short_hand) {
case APIC_DEST_NOSHORT:
- if (dest_mode == 0)
- /* Physical mode. */
- result = kvm_apic_match_physical_addr(target, dest);
+ if (dest_mode == APIC_DEST_PHYSICAL)
+ return kvm_apic_match_physical_addr(target, dest);
else
- /* Logical mode. */
- result = kvm_apic_match_logical_addr(target, dest);
- break;
+ return kvm_apic_match_logical_addr(target, dest);
case APIC_DEST_SELF:
- result = (target == source);
- break;
+ return target == source;
case APIC_DEST_ALLINC:
- result = 1;
- break;
+ return true;
case APIC_DEST_ALLBUT:
- result = (target != source);
- break;
+ return target != source;
default:
apic_debug("kvm: apic: Bad dest shorthand value %x\n",
short_hand);
- break;
+ return false;
}
-
- return result;
}
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
@@ -690,7 +686,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
ret = true;
- if (irq->dest_mode == 0) { /* physical mode */
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
@@ -1073,25 +1069,72 @@ static void apic_timer_expired(struct kvm_lapic *apic)
{
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+ struct kvm_timer *ktimer = &apic->lapic_timer;
- /*
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
- * vcpu_enter_guest.
- */
if (atomic_read(&apic->lapic_timer.pending))
return;
atomic_inc(&apic->lapic_timer.pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_set_pending_timer(vcpu);
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+ if (apic_lvtt_tscdeadline(apic))
+ ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+ if (kvm_apic_hw_enabled(apic)) {
+ int vec = reg & APIC_VECTOR_MASK;
+ void *bitmap = apic->regs + APIC_ISR;
+
+ if (kvm_x86_ops->deliver_posted_interrupt)
+ bitmap = apic->regs + APIC_IRR;
+
+ if (apic_test_vector(vec, bitmap))
+ return true;
+ }
+ return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u64 guest_tsc, tsc_deadline;
+
+ if (!kvm_vcpu_has_lapic(vcpu))
+ return;
+
+ if (apic->lapic_timer.expired_tscdeadline == 0)
+ return;
+
+ if (!lapic_timer_int_injected(vcpu))
+ return;
+
+ tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+ apic->lapic_timer.expired_tscdeadline = 0;
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
+
+ /* __delay is delay_tsc whenever the hardware has TSC, thus always. */
+ if (guest_tsc < tsc_deadline)
+ __delay(tsc_deadline - guest_tsc);
}
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1180,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+ ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1195,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
@@ -1742,7 +1788,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
if (kvm_x86_ops->hwapic_irr_update)
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
- kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
+ if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
+ apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index c674fce53cf9..0bc6c656625b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+ u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
};
@@ -56,9 +57,8 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void __kvm_apic_update_irr(u32 *pir, void *regs);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -170,4 +170,6 @@ static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f83fc6c5e0ba..cee759299a35 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -63,30 +63,16 @@ enum {
#undef MMU_DEBUG
#ifdef MMU_DEBUG
+static bool dbg = 0;
+module_param(dbg, bool, 0644);
#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
-
+#define MMU_WARN_ON(x) WARN_ON(x)
#else
-
#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)
-
-#endif
-
-#ifdef MMU_DEBUG
-static bool dbg = 0;
-module_param(dbg, bool, 0644);
-#endif
-
-#ifndef MMU_DEBUG
-#define ASSERT(x) do { } while (0)
-#else
-#define ASSERT(x) \
- if (!(x)) { \
- printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
- __FILE__, __LINE__, #x); \
- }
+#define MMU_WARN_ON(x) do { } while (0)
#endif
#define PTE_PREFETCH_NUM 8
@@ -546,6 +532,11 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}
+static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) != (new_spte & bit_mask);
+}
+
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -596,6 +587,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
if (!shadow_accessed_mask)
return ret;
+ /*
+ * Flush TLB when accessed/dirty bits are changed in the page tables,
+ * to guarantee consistency between TLB and page tables.
+ */
+ if (spte_is_bit_changed(old_spte, new_spte,
+ shadow_accessed_mask | shadow_dirty_mask))
+ ret = true;
+
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
@@ -1216,6 +1215,60 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
return flush;
}
+static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte &= ~shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+ flush |= spte_clear_dirty(kvm, sptep);
+ sptep = rmap_get_next(&iter);
+ }
+
+ return flush;
+}
+
+static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
+{
+ u64 spte = *sptep;
+
+ rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+
+ spte |= shadow_dirty_mask;
+
+ return mmu_spte_update(sptep, spte);
+}
+
+static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool flush = false;
+
+ for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+ BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+ flush |= spte_set_dirty(kvm, sptep);
+ sptep = rmap_get_next(&iter);
+ }
+
+ return flush;
+}
+
/**
* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
* @kvm: kvm instance
@@ -1226,7 +1279,7 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
* Used when we do not need to care about huge page mappings: e.g. during dirty
* logging we do not have any such mappings.
*/
-void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
@@ -1242,6 +1295,53 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
+/**
+ * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages
+ * @kvm: kvm instance
+ * @slot: slot to clear D-bit
+ * @gfn_offset: start of the BITS_PER_LONG pages we care about
+ * @mask: indicates which pages we should clear D-bit
+ *
+ * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
+ */
+void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ unsigned long *rmapp;
+
+ while (mask) {
+ rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmapp);
+
+ /* clear the first set bit */
+ mask &= mask - 1;
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
+ * PT level pages.
+ *
+ * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
+ * enable dirty logging for them.
+ *
+ * Used when we do not need to care about huge page mappings: e.g. during dirty
+ * logging we do not have any such mappings.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ if (kvm_x86_ops->enable_log_dirty_pt_masked)
+ kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
+ mask);
+ else
+ kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
+}
+
static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
@@ -1536,7 +1636,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
- ASSERT(is_empty_shadow_page(sp->spt));
+ MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
list_del(&sp->link);
free_page((unsigned long)sp->spt);
@@ -2501,8 +2601,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
}
- if (pte_access & ACC_WRITE_MASK)
+ if (pte_access & ACC_WRITE_MASK) {
mark_page_dirty(vcpu->kvm, gfn);
+ spte |= shadow_dirty_mask;
+ }
set_pte:
if (mmu_spte_update(sptep, spte))
@@ -2818,6 +2920,18 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
*/
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ /*
+ * Theoretically we could also set dirty bit (and flush TLB) here in
+ * order to eliminate unnecessary PML logging. See comments in
+ * set_spte. But fast_page_fault is very unlikely to happen with PML
+ * enabled, so we do not do this. This might result in the same GPA
+ * to be logged in PML buffer again when the write really happens, and
+ * eventually to be called by mark_page_dirty twice. But it's also no
+ * harm. This also avoids the TLB flush needed after setting dirty bit
+ * so non-PML cases won't be impacted.
+ *
+ * Compare with set_spte where instead shadow_dirty_mask is set.
+ */
if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
mark_page_dirty(vcpu->kvm, gfn);
@@ -3041,7 +3155,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
@@ -3079,7 +3193,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
@@ -3104,7 +3218,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
- ASSERT(!VALID_PAGE(root));
+ MMU_WARN_ON(VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
if (!is_present_gpte(pdptr)) {
@@ -3329,8 +3443,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
if (r)
return r;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
gfn = gva >> PAGE_SHIFT;
@@ -3396,8 +3509,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
int write = error_code & PFERR_WRITE_MASK;
bool map_writable;
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
@@ -3718,7 +3830,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
- ASSERT(is_pae(vcpu));
+ MMU_WARN_ON(!is_pae(vcpu));
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
@@ -3763,7 +3875,7 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
context->page_fault = tdp_page_fault;
@@ -3803,11 +3915,12 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
update_last_pte_bitmap(vcpu, context);
}
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
if (!is_paging(vcpu))
nonpaging_init_context(vcpu, context);
@@ -3818,19 +3931,19 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
else
paging32_init_context(vcpu, context);
- vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
- vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
- vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
- vcpu->arch.mmu.base_role.smep_andnot_wp
+ context->base_role.nxe = is_nx(vcpu);
+ context->base_role.cr4_pae = !!is_pae(vcpu);
+ context->base_role.cr0_wp = is_write_protection(vcpu);
+ context->base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ MMU_WARN_ON(VALID_PAGE(context->root_hpa));
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
@@ -3851,11 +3964,13 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
- vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
- vcpu->arch.walk_mmu->get_cr3 = get_cr3;
- vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ kvm_init_shadow_mmu(vcpu);
+ context->set_cr3 = kvm_x86_ops->set_cr3;
+ context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
+ context->inject_page_fault = kvm_inject_page_fault;
}
static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
@@ -3900,17 +4015,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
static void init_kvm_mmu(struct kvm_vcpu *vcpu)
{
if (mmu_is_nested(vcpu))
- return init_kvm_nested_mmu(vcpu);
+ init_kvm_nested_mmu(vcpu);
else if (tdp_enabled)
- return init_kvm_tdp_mmu(vcpu);
+ init_kvm_tdp_mmu(vcpu);
else
- return init_kvm_softmmu(vcpu);
+ init_kvm_softmmu(vcpu);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
init_kvm_mmu(vcpu);
}
@@ -4266,8 +4379,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
struct page *page;
int i;
- ASSERT(vcpu);
-
/*
* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
* Therefore we need to allocate shadow page tables in the first
@@ -4286,8 +4397,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
vcpu->arch.mmu.translate_gpa = translate_gpa;
@@ -4298,19 +4407,18 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
void kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
- ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
init_kvm_mmu(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
{
- struct kvm_memory_slot *memslot;
gfn_t last_gfn;
int i;
+ bool flush = false;
- memslot = id_to_memslot(kvm->memslots, slot);
last_gfn = memslot->base_gfn + memslot->npages - 1;
spin_lock(&kvm->mmu_lock);
@@ -4325,7 +4433,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
for (index = 0; index <= last_index; ++index, ++rmapp) {
if (*rmapp)
- __rmap_write_protect(kvm, rmapp, false);
+ flush |= __rmap_write_protect(kvm, rmapp,
+ false);
if (need_resched() || spin_needbreak(&kvm->mmu_lock))
cond_resched_lock(&kvm->mmu_lock);
@@ -4352,8 +4461,124 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
* instead of PT_WRITABLE_MASK, that means it does not depend
* on PT_WRITABLE_MASK anymore.
*/
- kvm_flush_remote_tlbs(kvm);
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+
+void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn,
+ PT_PAGE_TABLE_LEVEL);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_clear_dirty(kvm, rmapp);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /*
+ * It's also safe to flush TLBs out of mmu lock here as currently this
+ * function is only used for dirty logging, in which case flushing TLB
+ * out of mmu lock also guarantees no dirty pages will be lost in
+ * dirty_bitmap.
+ */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
+
+void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ int i;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_write_protect(kvm, rmapp,
+ false);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ /* see kvm_mmu_slot_remove_write_access */
+ lockdep_assert_held(&kvm->slots_lock);
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
+
+void kvm_mmu_slot_set_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ gfn_t last_gfn;
+ int i;
+ bool flush = false;
+
+ last_gfn = memslot->base_gfn + memslot->npages - 1;
+
+ spin_lock(&kvm->mmu_lock);
+
+ for (i = PT_PAGE_TABLE_LEVEL;
+ i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ unsigned long *rmapp;
+ unsigned long last_index, index;
+
+ rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+ last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+ for (index = 0; index <= last_index; ++index, ++rmapp) {
+ if (*rmapp)
+ flush |= __rmap_set_dirty(kvm, rmapp);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock))
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ spin_unlock(&kvm->mmu_lock);
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ /* see kvm_mmu_slot_leaf_clear_dirty */
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
}
+EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
#define BATCH_ZAP_PAGES 10
static void kvm_zap_obsolete_pages(struct kvm *kvm)
@@ -4606,8 +4831,6 @@ EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
- ASSERT(vcpu);
-
kvm_mmu_unload(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bde8ee725754..c7d65637c851 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -44,18 +44,6 @@
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
-#define PFERR_PRESENT_BIT 0
-#define PFERR_WRITE_BIT 1
-#define PFERR_USER_BIT 2
-#define PFERR_RSVD_BIT 3
-#define PFERR_FETCH_BIT 4
-
-#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
-#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
-#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
-#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
-#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
-
static inline u64 rsvd_bits(int s, int e)
{
return ((1ULL << (e - s + 1)) - 1) << s;
@@ -81,9 +69,8 @@ enum {
};
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- bool execonly);
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool ept);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41dd0387cccb..d319e0c24758 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1583,7 +1583,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+ unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (cr4 & X86_CR4_VMXE)
@@ -2003,8 +2003,8 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
-
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_mmu(vcpu);
vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index c2a34bb5ad93..7c7bc8bef21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -848,6 +848,24 @@ TRACE_EVENT(kvm_track_tsc,
#endif /* CONFIG_X86_64 */
+/*
+ * Tracepoint for PML full VMEXIT.
+ */
+TRACE_EVENT(kvm_pml_full,
+ TP_PROTO(unsigned int vcpu_id),
+ TP_ARGS(vcpu_id),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ ),
+
+ TP_printk("vcpu %d: PML full", __entry->vcpu_id)
+);
+
TRACE_EVENT(kvm_ple_window,
TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
TP_ARGS(grow, vcpu_id, new, old),
@@ -914,6 +932,26 @@ TRACE_EVENT(kvm_pvclock_update,
__entry->flags)
);
+TRACE_EVENT(kvm_wait_lapic_expire,
+ TP_PROTO(unsigned int vcpu_id, s64 delta),
+ TP_ARGS(vcpu_id, delta),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( s64, delta )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->delta = delta;
+ ),
+
+ TP_printk("vcpu %u: delta %lld (%s)",
+ __entry->vcpu_id,
+ __entry->delta,
+ __entry->delta < 0 ? "early" : "late")
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d4c58d884838..14c1a18d206a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -45,6 +45,7 @@
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
+#include <asm/apic.h>
#include "trace.h"
@@ -101,6 +102,9 @@ module_param(nested, bool, S_IRUGO);
static u64 __read_mostly host_xss;
+static bool __read_mostly enable_pml = 1;
+module_param_named(pml, enable_pml, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -215,7 +219,12 @@ struct __packed vmcs12 {
u64 tsc_offset;
u64 virtual_apic_page_addr;
u64 apic_access_addr;
+ u64 posted_intr_desc_addr;
u64 ept_pointer;
+ u64 eoi_exit_bitmap0;
+ u64 eoi_exit_bitmap1;
+ u64 eoi_exit_bitmap2;
+ u64 eoi_exit_bitmap3;
u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
@@ -330,6 +339,7 @@ struct __packed vmcs12 {
u32 vmx_preemption_timer_value;
u32 padding32[7]; /* room for future expansion */
u16 virtual_processor_id;
+ u16 posted_intr_nv;
u16 guest_es_selector;
u16 guest_cs_selector;
u16 guest_ss_selector;
@@ -338,6 +348,7 @@ struct __packed vmcs12 {
u16 guest_gs_selector;
u16 guest_ldtr_selector;
u16 guest_tr_selector;
+ u16 guest_intr_status;
u16 host_es_selector;
u16 host_cs_selector;
u16 host_ss_selector;
@@ -401,6 +412,10 @@ struct nested_vmx {
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
+ struct page *pi_desc_page;
+ struct pi_desc *pi_desc;
+ bool pi_pending;
+ u16 posted_intr_nv;
u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
@@ -408,6 +423,23 @@ struct nested_vmx {
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
+
+ u32 nested_vmx_procbased_ctls_low;
+ u32 nested_vmx_procbased_ctls_high;
+ u32 nested_vmx_true_procbased_ctls_low;
+ u32 nested_vmx_secondary_ctls_low;
+ u32 nested_vmx_secondary_ctls_high;
+ u32 nested_vmx_pinbased_ctls_low;
+ u32 nested_vmx_pinbased_ctls_high;
+ u32 nested_vmx_exit_ctls_low;
+ u32 nested_vmx_exit_ctls_high;
+ u32 nested_vmx_true_exit_ctls_low;
+ u32 nested_vmx_entry_ctls_low;
+ u32 nested_vmx_entry_ctls_high;
+ u32 nested_vmx_true_entry_ctls_low;
+ u32 nested_vmx_misc_low;
+ u32 nested_vmx_misc_high;
+ u32 nested_vmx_ept_caps;
};
#define POSTED_INTR_ON 0
@@ -511,6 +543,10 @@ struct vcpu_vmx {
/* Dynamic PLE window. */
int ple_window;
bool ple_window_dirty;
+
+ /* Support for PML */
+#define PML_ENTITY_NUM 512
+ struct page *pml_pg;
};
enum segment_cache_field {
@@ -594,6 +630,7 @@ static int max_shadow_read_write_fields =
static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+ FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
@@ -602,6 +639,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+ FIELD(GUEST_INTR_STATUS, guest_intr_status),
FIELD(HOST_ES_SELECTOR, host_es_selector),
FIELD(HOST_CS_SELECTOR, host_cs_selector),
FIELD(HOST_SS_SELECTOR, host_ss_selector),
@@ -618,7 +656,12 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(TSC_OFFSET, tsc_offset),
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+ FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
+ FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
+ FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
+ FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
@@ -766,6 +809,7 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
+static int vmx_vm_has_apicv(struct kvm *kvm);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -793,6 +837,7 @@ static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_msr_bitmap_nested;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;
@@ -959,16 +1004,6 @@ static inline bool cpu_has_vmx_ept_execute_only(void)
return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
-static inline bool cpu_has_vmx_eptp_uncacheable(void)
-{
- return vmx_capability.ept & VMX_EPTP_UC_BIT;
-}
-
-static inline bool cpu_has_vmx_eptp_writeback(void)
-{
- return vmx_capability.ept & VMX_EPTP_WB_BIT;
-}
-
static inline bool cpu_has_vmx_ept_2m_page(void)
{
return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
@@ -1073,6 +1108,11 @@ static inline bool cpu_has_vmx_shadow_vmcs(void)
SECONDARY_EXEC_SHADOW_VMCS;
}
+static inline bool cpu_has_vmx_pml(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -1112,6 +1152,26 @@ static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
vmx_xsaves_supported();
}
+static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+}
+
+static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
+}
+
+static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+}
+
+static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
+{
+ return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
+}
+
static inline bool is_exception(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2284,20 +2344,8 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
* if the corresponding bit in the (32-bit) control field *must* be on, and a
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
- * TODO: allow these variables to be modified (downgraded) by module options
- * or other means.
*/
-static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
-static u32 nested_vmx_true_procbased_ctls_low;
-static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
-static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
-static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
-static u32 nested_vmx_true_exit_ctls_low;
-static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
-static u32 nested_vmx_true_entry_ctls_low;
-static u32 nested_vmx_misc_low, nested_vmx_misc_high;
-static u32 nested_vmx_ept_caps;
-static __init void nested_vmx_setup_ctls_msrs(void)
+static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
{
/*
* Note that as a general rule, the high half of the MSRs (bits in
@@ -2316,57 +2364,74 @@ static __init void nested_vmx_setup_ctls_msrs(void)
/* pin-based controls */
rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
- nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
- PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS;
- nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
+ vmx->nested.nested_vmx_pinbased_ctls_low |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_pinbased_ctls_high &=
+ PIN_BASED_EXT_INTR_MASK |
+ PIN_BASED_NMI_EXITING |
+ PIN_BASED_VIRTUAL_NMIS;
+ vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+ vmx->nested.nested_vmx_pinbased_ctls_high |=
+ PIN_BASED_POSTED_INTR;
/* exit controls */
rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
- nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
+ vmx->nested.nested_vmx_exit_ctls_low =
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_exit_ctls_high &=
+ vmx->nested.nested_vmx_exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
- nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_exit_ctls_high |=
+ VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
if (vmx_mpx_supported())
- nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
+ vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low &
+ vmx->nested.nested_vmx_true_exit_ctls_low =
+ vmx->nested.nested_vmx_exit_ctls_low &
~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
- nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_entry_ctls_high &=
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
+ vmx->nested.nested_vmx_entry_ctls_low =
+ VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
VM_ENTRY_LOAD_IA32_PAT;
- nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
- VM_ENTRY_LOAD_IA32_EFER);
+ vmx->nested.nested_vmx_entry_ctls_high |=
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
if (vmx_mpx_supported())
- nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
+ vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low &
+ vmx->nested.nested_vmx_true_entry_ctls_low =
+ vmx->nested.nested_vmx_entry_ctls_low &
~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
- nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
- nested_vmx_procbased_ctls_high &=
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
+ vmx->nested.nested_vmx_procbased_ctls_low =
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+ vmx->nested.nested_vmx_procbased_ctls_high &=
CPU_BASED_VIRTUAL_INTR_PENDING |
CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
@@ -2386,45 +2451,55 @@ static __init void nested_vmx_setup_ctls_msrs(void)
* can use it to avoid exits to L1 - even when L0 runs L2
* without MSR bitmaps.
*/
- nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
+ vmx->nested.nested_vmx_procbased_ctls_high |=
+ CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low &
+ vmx->nested.nested_vmx_true_procbased_ctls_low =
+ vmx->nested.nested_vmx_procbased_ctls_low &
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
- nested_vmx_secondary_ctls_low = 0;
- nested_vmx_secondary_ctls_high &=
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
+ vmx->nested.nested_vmx_secondary_ctls_low = 0;
+ vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+ SECONDARY_EXEC_APIC_REGISTER_VIRT |
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_XSAVES;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+ vmx->nested.nested_vmx_secondary_ctls_high |=
+ SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST;
- nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+ vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
- nested_vmx_ept_caps &= vmx_capability.ept;
+ vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
* for single context invalidation. Hence, only advertise
* support for global context invalidation.
*/
- nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
+ vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
} else
- nested_vmx_ept_caps = 0;
+ vmx->nested.nested_vmx_ept_caps = 0;
/* miscellaneous data */
- rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
- nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
- nested_vmx_misc_low |= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
+ rdmsr(MSR_IA32_VMX_MISC,
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
+ vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ vmx->nested.nested_vmx_misc_low |=
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
- nested_vmx_misc_high = 0;
+ vmx->nested.nested_vmx_misc_high = 0;
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -2443,6 +2518,8 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
/* Returns 0 on success, non-0 otherwise. */
static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
/*
@@ -2457,36 +2534,44 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
- nested_vmx_pinbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_PROCBASED_CTLS:
- *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
- nested_vmx_procbased_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high);
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_EXIT_CTLS:
- *pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
- nested_vmx_exit_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high);
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_ENTRY_CTLS:
- *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
- nested_vmx_entry_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high);
break;
case MSR_IA32_VMX_MISC:
- *pdata = vmx_control_msr(nested_vmx_misc_low,
- nested_vmx_misc_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
break;
/*
* These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2511,12 +2596,13 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
- nested_vmx_secondary_ctls_high);
+ *pdata = vmx_control_msr(
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high);
break;
case MSR_IA32_VMX_EPT_VPID_CAP:
/* Currently, no nested vpid support */
- *pdata = nested_vmx_ept_caps;
+ *pdata = vmx->nested.nested_vmx_ept_caps;
break;
default:
return 1;
@@ -2785,7 +2871,7 @@ static int hardware_enable(void)
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;
- if (read_cr4() & X86_CR4_VMXE)
+ if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
@@ -2812,7 +2898,7 @@ static int hardware_enable(void)
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
- write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
+ cr4_set_bits(X86_CR4_VMXE);
if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
@@ -2849,7 +2935,7 @@ static void hardware_disable(void)
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
- write_cr4(read_cr4() & ~X86_CR4_VMXE);
+ cr4_clear_bits(X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -2929,7 +3015,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
- SECONDARY_EXEC_XSAVES;
+ SECONDARY_EXEC_XSAVES |
+ SECONDARY_EXEC_ENABLE_PML;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -4159,6 +4246,52 @@ static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
}
}
+/*
+ * If a msr is allowed by L0, we should check whether it is allowed by L1.
+ * The corresponding bit will be cleared unless both of L0 and L1 allow it.
+ */
+static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
+ unsigned long *msr_bitmap_nested,
+ u32 msr, int type)
+{
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap()) {
+ WARN_ON(1);
+ return;
+ }
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ if (msr <= 0x1fff) {
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
+ /* read-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x000 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
+ /* write-low */
+ __clear_bit(msr, msr_bitmap_nested + 0x800 / f);
+
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ if (type & MSR_TYPE_R &&
+ !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
+ /* read-high */
+ __clear_bit(msr, msr_bitmap_nested + 0x400 / f);
+
+ if (type & MSR_TYPE_W &&
+ !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
+ /* write-high */
+ __clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
+
+ }
+}
+
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
if (!longmode_only)
@@ -4197,6 +4330,64 @@ static int vmx_vm_has_apicv(struct kvm *kvm)
return enable_apicv && irqchip_in_kernel(kvm);
}
+static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+ void *vapic_page;
+ u16 status;
+
+ if (vmx->nested.pi_desc &&
+ vmx->nested.pi_pending) {
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return 0;
+
+ max_irr = find_last_bit(
+ (unsigned long *)vmx->nested.pi_desc->pir, 256);
+
+ if (max_irr == 256)
+ return 0;
+
+ vapic_page = kmap(vmx->nested.virtual_apic_page);
+ if (!vapic_page) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+ __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
+ kunmap(vmx->nested.virtual_apic_page);
+
+ status = vmcs_read16(GUEST_INTR_STATUS);
+ if ((u8)max_irr > ((u8)status & 0xff)) {
+ status &= ~0xff;
+ status |= (u8)max_irr;
+ vmcs_write16(GUEST_INTR_STATUS, status);
+ }
+ }
+ return 0;
+}
+
+static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
+ int vector)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (is_guest_mode(vcpu) &&
+ vector == vmx->nested.posted_intr_nv) {
+ /* the PIR and ON have been set by L1. */
+ if (vcpu->mode == IN_GUEST_MODE)
+ apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+ POSTED_INTR_VECTOR);
+ /*
+ * If a posted intr is not recognized by hardware,
+ * we will accomplish it in the next vmentry.
+ */
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return 0;
+ }
+ return -1;
+}
/*
* Send interrupt to vcpu via posted interrupt way.
* 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4209,6 +4400,10 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;
+ r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
+ if (!r)
+ return;
+
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
@@ -4255,7 +4450,7 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
/* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
vmx->host_state.vmcs_host_cr4 = cr4;
@@ -4360,6 +4555,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
a current VMCS12
*/
exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+ /* PML is enabled/disabled in creating/destorying vcpu */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
return exec_control;
}
@@ -4986,11 +5184,12 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
+static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long always_on = VMXON_CR0_ALWAYSON;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- if (nested_vmx_secondary_ctls_high &
+ if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
SECONDARY_EXEC_UNRESTRICTED_GUEST &&
nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
always_on &= ~(X86_CR0_PE | X86_CR0_PG);
@@ -5015,7 +5214,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_cr0_valid(vmcs12, val))
+ if (!nested_cr0_valid(vcpu, val))
return 1;
if (kvm_set_cr0(vcpu, val))
@@ -5817,13 +6016,21 @@ static __init int hardware_setup(void)
(unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_msr_bitmap_longmode_x2apic)
goto out4;
+
+ if (nested) {
+ vmx_msr_bitmap_nested =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_nested)
+ goto out5;
+ }
+
vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmread_bitmap)
- goto out5;
+ goto out6;
vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
if (!vmx_vmwrite_bitmap)
- goto out6;
+ goto out7;
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -5839,10 +6046,12 @@ static __init int hardware_setup(void)
memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+ if (nested)
+ memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out7;
+ goto out8;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -5868,16 +6077,16 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
- if (!cpu_has_vmx_flexpriority()) {
+ if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
- /*
- * set_apic_access_page_addr() is used to reload apic access
- * page upon invalidation. No need to do anything if the
- * processor does not have the APIC_ACCESS_ADDR VMCS field.
- */
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if not
+ * using the APIC_ACCESS_ADDR VMCS field.
+ */
+ if (!flexpriority_enabled)
kvm_x86_ops->set_apic_access_page_addr = NULL;
- }
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
@@ -5895,13 +6104,11 @@ static __init int hardware_setup(void)
kvm_x86_ops->update_cr8_intercept = NULL;
else {
kvm_x86_ops->hwapic_irr_update = NULL;
+ kvm_x86_ops->hwapic_isr_update = NULL;
kvm_x86_ops->deliver_posted_interrupt = NULL;
kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
}
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -5945,12 +6152,29 @@ static __init int hardware_setup(void)
update_ple_window_actual_max();
+ /*
+ * Only enable PML when hardware supports PML feature, and both EPT
+ * and EPT A/D bit features are enabled -- PML depends on them to work.
+ */
+ if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
+ enable_pml = 0;
+
+ if (!enable_pml) {
+ kvm_x86_ops->slot_enable_log_dirty = NULL;
+ kvm_x86_ops->slot_disable_log_dirty = NULL;
+ kvm_x86_ops->flush_log_dirty = NULL;
+ kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ }
+
return alloc_kvm_area();
-out7:
+out8:
free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
+out7:
free_page((unsigned long)vmx_vmread_bitmap);
+out6:
+ if (nested)
+ free_page((unsigned long)vmx_msr_bitmap_nested);
out5:
free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
out4:
@@ -5977,6 +6201,8 @@ static __exit void hardware_unsetup(void)
free_page((unsigned long)vmx_io_bitmap_a);
free_page((unsigned long)vmx_vmwrite_bitmap);
free_page((unsigned long)vmx_vmread_bitmap);
+ if (nested)
+ free_page((unsigned long)vmx_msr_bitmap_nested);
free_kvm_area();
}
@@ -6143,6 +6369,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
*/
}
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+ /* TODO: not to reset guest simply here. */
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{
struct vcpu_vmx *vmx =
@@ -6432,6 +6665,7 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
+ vmx->nested.posted_intr_nv = -1;
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
@@ -6460,6 +6694,12 @@ static void free_nested(struct vcpu_vmx *vmx)
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
nested_free_all_saved_vmcss(vmx);
}
@@ -6893,6 +7133,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmx_instruction_info, types;
unsigned long type;
gva_t gva;
@@ -6901,8 +7142,9 @@ static int handle_invept(struct kvm_vcpu *vcpu)
u64 eptp, gpa;
} operand;
- if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
- !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+ if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_ENABLE_EPT) ||
+ !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6918,7 +7160,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+ types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
@@ -6960,6 +7202,31 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_pml_full(struct kvm_vcpu *vcpu)
+{
+ unsigned long exit_qualification;
+
+ trace_kvm_pml_full(vcpu->vcpu_id);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ /*
+ * PML buffer FULL happened while executing iret from NMI,
+ * "blocked by NMI" bit has to be set before next VM entry.
+ */
+ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ cpu_has_virtual_nmis() &&
+ (exit_qualification & INTR_INFO_UNBLOCK_NMI))
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+
+ /*
+ * PML buffer already flushed at beginning of VMEXIT. Nothing to do
+ * here.., and there's no userspace involvement needed for PML.
+ */
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7008,6 +7275,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVVPID] = handle_invvpid,
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
+ [EXIT_REASON_PML_FULL] = handle_pml_full,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7275,6 +7543,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_APIC_ACCESS:
return nested_cpu_has2(vmcs12,
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ case EXIT_REASON_APIC_WRITE:
+ case EXIT_REASON_EOI_INDUCED:
+ /* apic_write and eoi_induced should exit unconditionally. */
+ return 1;
case EXIT_REASON_EPT_VIOLATION:
/*
* L0 always deals with the EPT violation. If nested EPT is
@@ -7314,6 +7586,89 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}
+static int vmx_enable_pml(struct vcpu_vmx *vmx)
+{
+ struct page *pml_pg;
+ u32 exec_control;
+
+ pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!pml_pg)
+ return -ENOMEM;
+
+ vmx->pml_pg = pml_pg;
+
+ vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control |= SECONDARY_EXEC_ENABLE_PML;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+
+ return 0;
+}
+
+static void vmx_disable_pml(struct vcpu_vmx *vmx)
+{
+ u32 exec_control;
+
+ ASSERT(vmx->pml_pg);
+ __free_page(vmx->pml_pg);
+ vmx->pml_pg = NULL;
+
+ exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+ exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+ vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+}
+
+static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+{
+ struct kvm *kvm = vmx->vcpu.kvm;
+ u64 *pml_buf;
+ u16 pml_idx;
+
+ pml_idx = vmcs_read16(GUEST_PML_INDEX);
+
+ /* Do nothing if PML buffer is empty */
+ if (pml_idx == (PML_ENTITY_NUM - 1))
+ return;
+
+ /* PML index always points to next available PML buffer entity */
+ if (pml_idx >= PML_ENTITY_NUM)
+ pml_idx = 0;
+ else
+ pml_idx++;
+
+ pml_buf = page_address(vmx->pml_pg);
+ for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
+ u64 gpa;
+
+ gpa = pml_buf[pml_idx];
+ WARN_ON(gpa & (PAGE_SIZE - 1));
+ mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ }
+
+ /* reset PML index */
+ vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
+}
+
+/*
+ * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
+ * Called before reporting dirty_bitmap to userspace.
+ */
+static void kvm_flush_pml_buffers(struct kvm *kvm)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ /*
+ * We only need to kick vcpu out of guest mode here, as PML buffer
+ * is flushed at beginning of all VMEXITs, and it's obvious that only
+ * vcpus running in guest are possible to have unflushed GPAs in PML
+ * buffer.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7324,6 +7679,16 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
+ /*
+ * Flush logged GPAs PML buffer, this will make dirty_bitmap more
+ * updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
+ * querying dirty_bitmap, we only need to kick all vcpus out of guest
+ * mode as if vcpus is in root mode, the PML buffer must has been
+ * flushed already.
+ */
+ if (enable_pml)
+ vmx_flush_pml_buffer(vmx);
+
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
@@ -7471,9 +7836,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
u16 status;
u8 old;
- if (!vmx_vm_has_apicv(kvm))
- return;
-
if (isr == -1)
isr = 0;
@@ -7784,7 +8146,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
- cr4 = read_cr4();
+ cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
vmcs_writel(HOST_CR4, cr4);
vmx->host_state.vmcs_host_cr4 = cr4;
@@ -7973,6 +8335,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (enable_pml)
+ vmx_disable_pml(vmx);
free_vpid(vmx);
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
@@ -8040,9 +8404,25 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_vmcs;
}
+ if (nested)
+ nested_vmx_setup_ctls_msrs(vmx);
+
+ vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+ /*
+ * If PML is turned on, failure on enabling PML just results in failure
+ * of creating the vcpu, therefore we can simplify PML logic (by
+ * avoiding dealing with cases, such as enabling PML partially on vcpus
+ * for the guest, etc.
+ */
+ if (enable_pml) {
+ err = vmx_enable_pml(vmx);
+ if (err)
+ goto free_vmcs;
+ }
+
return &vmx->vcpu;
free_vmcs:
@@ -8184,9 +8564,10 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
- nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
-
+ WARN_ON(mmu_is_nested(vcpu));
+ kvm_init_shadow_ept_mmu(vcpu,
+ to_vmx(vcpu)->nested.nested_vmx_ept_caps &
+ VMX_EPT_EXECUTE_ONLY_BIT);
vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3;
vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -8199,6 +8580,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
}
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code)
+{
+ bool inequality, bit;
+
+ bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+ inequality =
+ (error_code & vmcs12->page_fault_error_code_mask) !=
+ vmcs12->page_fault_error_code_match;
+ return inequality ^ bit;
+}
+
static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -8206,8 +8599,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
- if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
vmcs_read32(VM_EXIT_INTR_INFO),
vmcs_readl(EXIT_QUALIFICATION));
@@ -8261,6 +8653,31 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
return false;
}
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+ return false;
+
+ if (vmx->nested.pi_desc_page) { /* shouldn't happen */
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ }
+ vmx->nested.pi_desc_page =
+ nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
+ if (!vmx->nested.pi_desc_page)
+ return false;
+
+ vmx->nested.pi_desc =
+ (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
+ if (!vmx->nested.pi_desc) {
+ nested_release_page_clean(vmx->nested.pi_desc_page);
+ return false;
+ }
+ vmx->nested.pi_desc =
+ (struct pi_desc *)((void *)vmx->nested.pi_desc +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ }
+
return true;
}
@@ -8286,6 +8703,310 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
}
+static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int maxphyaddr;
+ u64 addr;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
+ return 0;
+
+ if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+
+ if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
+ ((addr + PAGE_SIZE) >> maxphyaddr))
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Merge L0's and L1's MSR bitmap, return false to indicate that
+ * we do not use the hardware.
+ */
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int msr;
+ struct page *page;
+ unsigned long *msr_bitmap;
+
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
+ return false;
+
+ page = nested_get_page(vcpu, vmcs12->msr_bitmap);
+ if (!page) {
+ WARN_ON(1);
+ return false;
+ }
+ msr_bitmap = (unsigned long *)kmap(page);
+ if (!msr_bitmap) {
+ nested_release_page_clean(page);
+ WARN_ON(1);
+ return false;
+ }
+
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
+ if (nested_cpu_has_apic_reg_virt(vmcs12))
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ msr, MSR_TYPE_R);
+ /* TPR is allowed */
+ nested_vmx_disable_intercept_for_msr(msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+ MSR_TYPE_R | MSR_TYPE_W);
+ if (nested_cpu_has_vid(vmcs12)) {
+ /* EOI and self-IPI are allowed */
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_EOI >> 4),
+ MSR_TYPE_W);
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap,
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+ MSR_TYPE_W);
+ }
+ } else {
+ /*
+ * Enable reading intercept of all the x2apic
+ * MSRs. We should not rely on vmcs12 to do any
+ * optimizations here, it may have been modified
+ * by L1.
+ */
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ msr,
+ MSR_TYPE_R);
+
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_TASKPRI >> 4),
+ MSR_TYPE_W);
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_EOI >> 4),
+ MSR_TYPE_W);
+ __vmx_enable_intercept_for_msr(
+ vmx_msr_bitmap_nested,
+ APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
+ MSR_TYPE_W);
+ }
+ kunmap(page);
+ nested_release_page_clean(page);
+
+ return true;
+}
+
+static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ !nested_cpu_has_apic_reg_virt(vmcs12) &&
+ !nested_cpu_has_vid(vmcs12) &&
+ !nested_cpu_has_posted_intr(vmcs12))
+ return 0;
+
+ /*
+ * If virtualize x2apic mode is enabled,
+ * virtualize apic access must be disabled.
+ */
+ if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return -EINVAL;
+
+ /*
+ * If virtual interrupt delivery is enabled,
+ * we must exit on external interrupts.
+ */
+ if (nested_cpu_has_vid(vmcs12) &&
+ !nested_exit_on_intr(vcpu))
+ return -EINVAL;
+
+ /*
+ * bits 15:8 should be zero in posted_intr_nv,
+ * the descriptor address has been already checked
+ * in nested_get_vmcs12_pages.
+ */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ (!nested_cpu_has_vid(vmcs12) ||
+ !nested_exit_intr_ack_set(vcpu) ||
+ vmcs12->posted_intr_nv & 0xff00))
+ return -EINVAL;
+
+ /* tpr shadow is needed by all apicv features. */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+ unsigned long count_field,
+ unsigned long addr_field,
+ int maxphyaddr)
+{
+ u64 count, addr;
+
+ if (vmcs12_read_any(vcpu, count_field, &count) ||
+ vmcs12_read_any(vcpu, addr_field, &addr)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ if (count == 0)
+ return 0;
+ if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+ (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+ pr_warn_ratelimited(
+ "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+ addr_field, maxphyaddr, count, addr);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ int maxphyaddr;
+
+ if (vmcs12->vm_exit_msr_load_count == 0 &&
+ vmcs12->vm_exit_msr_store_count == 0 &&
+ vmcs12->vm_entry_msr_load_count == 0)
+ return 0; /* Fast path */
+ maxphyaddr = cpuid_maxphyaddr(vcpu);
+ if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+ VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+ VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+ nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+ VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ /* x2APIC MSR accesses are not allowed */
+ if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ return -EINVAL;
+ if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+ e->index == MSR_IA32_UCODE_REV)
+ return -EINVAL;
+ if (e->reserved != 0)
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_FS_BASE ||
+ e->index == MSR_GS_BASE ||
+ e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+ struct vmx_msr_entry *e)
+{
+ if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+ nested_vmx_msr_check_common(vcpu, e))
+ return -EINVAL;
+ return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+ struct msr_data msr;
+
+ msr.host_initiated = false;
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ goto fail;
+ }
+ if (nested_vmx_load_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ goto fail;
+ }
+ msr.index = e.index;
+ msr.data = e.value;
+ if (kvm_set_msr(vcpu, &msr)) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ goto fail;
+ }
+ }
+ return 0;
+fail:
+ return i + 1;
+}
+
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+ u32 i;
+ struct vmx_msr_entry e;
+
+ for (i = 0; i < count; i++) {
+ if (kvm_read_guest(vcpu->kvm,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(e));
+ return -EINVAL;
+ }
+ if (nested_vmx_store_msr_check(vcpu, &e)) {
+ pr_warn_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e.index, e.reserved);
+ return -EINVAL;
+ }
+ if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ pr_warn_ratelimited(
+ "%s cannot read MSR (%u, 0x%x)\n",
+ __func__, i, e.index);
+ return -EINVAL;
+ }
+ if (kvm_write_guest(vcpu->kvm,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &e.value, sizeof(e.value))) {
+ pr_warn_ratelimited(
+ "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+ __func__, i, e.index, e.value);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8365,8 +9086,23 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
exec_control = vmcs12->pin_based_vm_exec_control;
exec_control |= vmcs_config.pin_based_exec_ctrl;
- exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER |
- PIN_BASED_POSTED_INTR);
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ /*
+ * Note that we use L0's vector here and in
+ * vmx_deliver_nested_posted_interrupt.
+ */
+ vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
+ vmx->nested.pi_pending = false;
+ vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
+ } else
+ exec_control &= ~PIN_BASED_POSTED_INTR;
+
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
vmx->nested.preemption_timer_expired = false;
@@ -8423,12 +9159,26 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
- } else if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) {
+ } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+ (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
kvm_vcpu_reload_apic_access_page(vcpu);
}
+ if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
+ vmcs_write64(EOI_EXIT_BITMAP0,
+ vmcs12->eoi_exit_bitmap0);
+ vmcs_write64(EOI_EXIT_BITMAP1,
+ vmcs12->eoi_exit_bitmap1);
+ vmcs_write64(EOI_EXIT_BITMAP2,
+ vmcs12->eoi_exit_bitmap2);
+ vmcs_write64(EOI_EXIT_BITMAP3,
+ vmcs12->eoi_exit_bitmap3);
+ vmcs_write16(GUEST_INTR_STATUS,
+ vmcs12->guest_intr_status);
+ }
+
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -8462,11 +9212,17 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}
+ if (cpu_has_vmx_msr_bitmap() &&
+ exec_control & CPU_BASED_USE_MSR_BITMAPS &&
+ nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
+ vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+ } else
+ exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+
/*
- * Merging of IO and MSR bitmaps not currently supported.
+ * Merging of IO bitmap not currently supported.
* Rather, exit every time.
*/
- exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;
@@ -8582,6 +9338,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
int cpu;
struct loaded_vmcs *vmcs02;
bool ia32e;
+ u32 msr_entry_idx;
if (!nested_vmx_check_permission(vcpu) ||
!nested_vmx_check_vmcs12(vcpu))
@@ -8616,41 +9373,42 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
- !PAGE_ALIGNED(vmcs12->msr_bitmap)) {
+ if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
/*TODO: Also verify bits beyond physical address width are 0*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
- if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
- /*TODO: Also verify bits beyond physical address width are 0*/
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
- if (vmcs12->vm_entry_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_load_count > 0 ||
- vmcs12->vm_exit_msr_store_count > 0) {
- pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
- __func__);
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
+ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return 1;
+ }
+
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- nested_vmx_true_procbased_ctls_low,
- nested_vmx_procbased_ctls_high) ||
+ vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
- nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+ vmx->nested.nested_vmx_secondary_ctls_low,
+ vmx->nested.nested_vmx_secondary_ctls_high) ||
!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
- nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+ vmx->nested.nested_vmx_pinbased_ctls_low,
+ vmx->nested.nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- nested_vmx_true_exit_ctls_low,
- nested_vmx_exit_ctls_high) ||
+ vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- nested_vmx_true_entry_ctls_low,
- nested_vmx_entry_ctls_high))
+ vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
@@ -8663,7 +9421,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return 1;
}
- if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
+ if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -8739,10 +9497,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- vmcs12->launch_state = 1;
-
prepare_vmcs02(vcpu, vmcs12);
+ msr_entry_idx = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (msr_entry_idx) {
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+ return 1;
+ }
+
+ vmcs12->launch_state = 1;
+
if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
return kvm_emulate_halt(vcpu);
@@ -8869,9 +9638,10 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if (vmx->nested.nested_run_pending)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
}
- return 0;
+ return vmx_complete_nested_posted_interrupt(vcpu);
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -8981,6 +9751,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
}
+ if (nested_cpu_has_vid(vmcs12))
+ vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
+
vmcs12->vm_entry_controls =
(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
@@ -9172,6 +9945,13 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_msr_bitmap())
+ vmx_set_msr_bitmap(vcpu);
+
+ if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+ vmcs12->vm_exit_msr_load_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
}
/*
@@ -9193,6 +9973,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
exit_qualification);
+ if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+ vmcs12->vm_exit_msr_store_count))
+ nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+
vmx_load_vmcs01(vcpu);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9235,6 +10019,12 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page = NULL;
}
+ if (vmx->nested.pi_desc_page) {
+ kunmap(vmx->nested.pi_desc_page);
+ nested_release_page(vmx->nested.pi_desc_page);
+ vmx->nested.pi_desc_page = NULL;
+ vmx->nested.pi_desc = NULL;
+ }
/*
* We are now running in L2, mmu_notifier will force to reload the
@@ -9301,6 +10091,31 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
shrink_ple_window(vcpu);
}
+static void vmx_slot_enable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
+}
+
+static void vmx_slot_disable_log_dirty(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ kvm_mmu_slot_set_dirty(kvm, slot);
+}
+
+static void vmx_flush_log_dirty(struct kvm *kvm)
+{
+ kvm_flush_pml_buffers(kvm);
+}
+
+static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
+ gfn_t offset, unsigned long mask)
+{
+ kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -9409,6 +10224,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_nested_events = vmx_check_nested_events,
.sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c259814200bd..bd7a70be41b3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
+/* lapic timer advance (tscdeadline mode only) in nanoseconds */
+unsigned int lapic_timer_advance_ns = 0;
+module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
+
static bool backwards_tsc_observed = false;
#define KVM_NR_SHARED_MSRS 16
@@ -141,6 +145,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "nmi_window", VCPU_STAT(nmi_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
+ { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
@@ -492,7 +497,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
+static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
void *data, int offset, int len, u32 access)
{
return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
@@ -643,7 +648,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
}
}
-int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
u64 xcr0 = xcr;
u64 old_xcr0 = vcpu->arch.xcr0;
@@ -1083,6 +1088,15 @@ static void update_pvclock_gtod(struct timekeeper *tk)
}
#endif
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest. This function is only called from
+ * the physical CPU that is running vcpu.
+ */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+}
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
@@ -1180,7 +1194,7 @@ static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
#endif
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
-unsigned long max_tsc_khz;
+static unsigned long max_tsc_khz;
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
@@ -1234,7 +1248,7 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
@@ -1529,7 +1543,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_cycle_now);
ka->use_master_clock = host_tsc_clocksource && vcpus_matched
- && !backwards_tsc_observed;
+ && !backwards_tsc_observed
+ && !ka->boot_vcpu_runs_old_kvmclock;
if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -2161,8 +2176,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
u64 gpa_offset;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
kvmclock_reset(vcpu);
+ if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
+ bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
+
+ if (ka->boot_vcpu_runs_old_kvmclock != tmp)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
+
+ ka->boot_vcpu_runs_old_kvmclock = tmp;
+ }
+
vcpu->arch.time = data;
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
@@ -2324,6 +2351,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
@@ -2738,6 +2766,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
+ case KVM_CAP_TSC_DEADLINE_TIMER:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2776,9 +2805,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
- case KVM_CAP_TSC_DEADLINE_TIMER:
- r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
- break;
default:
r = 0;
break;
@@ -3734,83 +3760,43 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
* @kvm: kvm instance
* @log: slot id and address to which we copy the log
*
- * We need to keep it in mind that VCPU threads can write to the bitmap
- * concurrently. So, to avoid losing data, we keep the following order for
- * each bit:
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
*
* 1. Take a snapshot of the bit and clear it if needed.
* 2. Write protect the corresponding page.
- * 3. Flush TLB's if needed.
- * 4. Copy the snapshot to the userspace.
- *
- * Between 2 and 3, the guest may write to the page using the remaining TLB
- * entry. This is not a problem because the page will be reported dirty at
- * step 4 using the snapshot taken before and step 3 ensures that successive
- * writes will be logged for the next call.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
*/
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- int r;
- struct kvm_memory_slot *memslot;
- unsigned long n, i;
- unsigned long *dirty_bitmap;
- unsigned long *dirty_bitmap_buffer;
bool is_dirty = false;
+ int r;
mutex_lock(&kvm->slots_lock);
- r = -EINVAL;
- if (log->slot >= KVM_USER_MEM_SLOTS)
- goto out;
-
- memslot = id_to_memslot(kvm->memslots, log->slot);
-
- dirty_bitmap = memslot->dirty_bitmap;
- r = -ENOENT;
- if (!dirty_bitmap)
- goto out;
-
- n = kvm_dirty_bitmap_bytes(memslot);
-
- dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
- memset(dirty_bitmap_buffer, 0, n);
-
- spin_lock(&kvm->mmu_lock);
-
- for (i = 0; i < n / sizeof(long); i++) {
- unsigned long mask;
- gfn_t offset;
-
- if (!dirty_bitmap[i])
- continue;
-
- is_dirty = true;
-
- mask = xchg(&dirty_bitmap[i], 0);
- dirty_bitmap_buffer[i] = mask;
-
- offset = i * BITS_PER_LONG;
- kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
- }
-
- spin_unlock(&kvm->mmu_lock);
+ /*
+ * Flush potentially hardware-cached dirty pages to dirty_bitmap.
+ */
+ if (kvm_x86_ops->flush_log_dirty)
+ kvm_x86_ops->flush_log_dirty(kvm);
- /* See the comments in kvm_mmu_slot_remove_write_access(). */
- lockdep_assert_held(&kvm->slots_lock);
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
* kvm_mmu_slot_remove_write_access().
*/
+ lockdep_assert_held(&kvm->slots_lock);
if (is_dirty)
kvm_flush_remote_tlbs(kvm);
- r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
- goto out;
-
- r = 0;
-out:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -4516,6 +4502,8 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ addr = (u32)addr;
val += now;
bytes -= now;
}
@@ -4984,6 +4972,11 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon
kvm_register_write(emul_to_vcpu(ctxt), reg, val);
}
+static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
+{
+ kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5019,6 +5012,7 @@ static const struct x86_emulate_ops emulate_ops = {
.put_fpu = emulator_put_fpu,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .set_nmi_mask = emulator_set_nmi_mask,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -6311,6 +6305,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
trace_kvm_entry(vcpu->vcpu_id);
+ wait_lapic_expire(vcpu);
kvm_x86_ops->run(vcpu);
/*
@@ -7041,15 +7036,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
-int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- int r;
struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
- r = vcpu_load(vcpu);
- if (r)
- return r;
+ if (vcpu_load(vcpu))
+ return;
msr.data = 0x0;
msr.index = MSR_IA32_TSC;
msr.host_initiated = true;
@@ -7058,8 +7051,6 @@ int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
-
- return r;
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -7549,12 +7540,62 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
return 0;
}
+static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
+ struct kvm_memory_slot *new)
+{
+ /* Still write protect RO slot */
+ if (new->flags & KVM_MEM_READONLY) {
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ return;
+ }
+
+ /*
+ * Call kvm_x86_ops dirty logging hooks when they are valid.
+ *
+ * kvm_x86_ops->slot_disable_log_dirty is called when:
+ *
+ * - KVM_MR_CREATE with dirty logging is disabled
+ * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
+ *
+ * The reason is, in case of PML, we need to set D-bit for any slots
+ * with dirty logging disabled in order to eliminate unnecessary GPA
+ * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * guarantees leaving PML enabled during guest's lifetime won't have
+ * any additonal overhead from PML when guest is running with dirty
+ * logging disabled for memory slots.
+ *
+ * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+ * to dirty logging mode.
+ *
+ * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
+ *
+ * In case of write protect:
+ *
+ * Write protect all pages for dirty logging.
+ *
+ * All the sptes including the large sptes which point to this
+ * slot are set to readonly. We can not create any new large
+ * spte on this slot until the end of the logging.
+ *
+ * See the comments in fast_page_fault().
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops->slot_enable_log_dirty)
+ kvm_x86_ops->slot_enable_log_dirty(kvm, new);
+ else
+ kvm_mmu_slot_remove_write_access(kvm, new);
+ } else {
+ if (kvm_x86_ops->slot_disable_log_dirty)
+ kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+ }
+}
+
void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
enum kvm_mr_change change)
{
-
+ struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
@@ -7573,17 +7614,20 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+
+ /* It's OK to get 'new' slot here as it has already been installed */
+ new = id_to_memslot(kvm->memslots, mem->slot);
+
/*
- * Write protect all pages for dirty logging.
+ * Set up write protection and/or dirty logging for the new slot.
*
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
- *
- * See the comments in fast_page_fault().
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
+ * been zapped so no dirty logging staff is needed for old slot. For
+ * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
+ * new and it's also covered when dealing with the new slot.
*/
- if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
- kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ if (change != KVM_MR_DELETE)
+ kvm_mmu_slot_apply_flags(kvm, new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -7837,3 +7881,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index cc1d61af6140..f5fef1868096 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -147,6 +147,7 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
+void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
@@ -170,5 +171,7 @@ extern u64 kvm_supported_xcr0(void);
extern unsigned int min_timer_period_us;
+extern unsigned int lapic_timer_advance_ns;
+
extern struct static_key kvm_no_apic_vcpu;
#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index c1c1544b8485..ac4453d8520e 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -56,6 +56,9 @@
#include <linux/virtio_console.h>
#include <linux/pm.h>
#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/virtio_pci.h>
+#include <asm/acpi.h>
#include <asm/apic.h>
#include <asm/lguest.h>
#include <asm/paravirt.h>
@@ -71,6 +74,8 @@
#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
#include <asm/kvm_para.h>
+#include <asm/pci_x86.h>
+#include <asm/pci-direct.h>
/*G:010
* Welcome to the Guest!
@@ -831,6 +836,24 @@ static struct irq_chip lguest_irq_controller = {
.irq_unmask = enable_lguest_irq,
};
+static int lguest_enable_irq(struct pci_dev *dev)
+{
+ u8 line = 0;
+
+ /* We literally use the PCI interrupt line as the irq number. */
+ pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &line);
+ irq_set_chip_and_handler_name(line, &lguest_irq_controller,
+ handle_level_irq, "level");
+ dev->irq = line;
+ return 0;
+}
+
+/* We don't do hotplug PCI, so this shouldn't be called. */
+static void lguest_disable_irq(struct pci_dev *dev)
+{
+ WARN_ON(1);
+}
+
/*
* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
* interrupt (except 128, which is used for system calls), and then tells the
@@ -1181,25 +1204,136 @@ static __init char *lguest_memory_setup(void)
return "LGUEST";
}
+/* Offset within PCI config space of BAR access capability. */
+static int console_cfg_offset = 0;
+static int console_access_cap;
+
+/* Set up so that we access off in bar0 (on bus 0, device 1, function 0) */
+static void set_cfg_window(u32 cfg_offset, u32 off)
+{
+ write_pci_config_byte(0, 1, 0,
+ cfg_offset + offsetof(struct virtio_pci_cap, bar),
+ 0);
+ write_pci_config(0, 1, 0,
+ cfg_offset + offsetof(struct virtio_pci_cap, length),
+ 4);
+ write_pci_config(0, 1, 0,
+ cfg_offset + offsetof(struct virtio_pci_cap, offset),
+ off);
+}
+
+static void write_bar_via_cfg(u32 cfg_offset, u32 off, u32 val)
+{
+ /*
+ * We could set this up once, then leave it; nothing else in the *
+ * kernel should touch these registers. But if it went wrong, that
+ * would be a horrible bug to find.
+ */
+ set_cfg_window(cfg_offset, off);
+ write_pci_config(0, 1, 0,
+ cfg_offset + sizeof(struct virtio_pci_cap), val);
+}
+
+static void probe_pci_console(void)
+{
+ u8 cap, common_cap = 0, device_cap = 0;
+ /* Offset within BAR0 */
+ u32 device_offset;
+ u32 device_len;
+
+ /* Avoid recursive printk into here. */
+ console_cfg_offset = -1;
+
+ if (!early_pci_allowed()) {
+ printk(KERN_ERR "lguest: early PCI access not allowed!\n");
+ return;
+ }
+
+ /* We expect a console PCI device at BUS0, slot 1. */
+ if (read_pci_config(0, 1, 0, 0) != 0x10431AF4) {
+ printk(KERN_ERR "lguest: PCI device is %#x!\n",
+ read_pci_config(0, 1, 0, 0));
+ return;
+ }
+
+ /* Find the capabilities we need (must be in bar0) */
+ cap = read_pci_config_byte(0, 1, 0, PCI_CAPABILITY_LIST);
+ while (cap) {
+ u8 vndr = read_pci_config_byte(0, 1, 0, cap);
+ if (vndr == PCI_CAP_ID_VNDR) {
+ u8 type, bar;
+ u32 offset, length;
+
+ type = read_pci_config_byte(0, 1, 0,
+ cap + offsetof(struct virtio_pci_cap, cfg_type));
+ bar = read_pci_config_byte(0, 1, 0,
+ cap + offsetof(struct virtio_pci_cap, bar));
+ offset = read_pci_config(0, 1, 0,
+ cap + offsetof(struct virtio_pci_cap, offset));
+ length = read_pci_config(0, 1, 0,
+ cap + offsetof(struct virtio_pci_cap, length));
+
+ switch (type) {
+ case VIRTIO_PCI_CAP_DEVICE_CFG:
+ if (bar == 0) {
+ device_cap = cap;
+ device_offset = offset;
+ device_len = length;
+ }
+ break;
+ case VIRTIO_PCI_CAP_PCI_CFG:
+ console_access_cap = cap;
+ break;
+ }
+ }
+ cap = read_pci_config_byte(0, 1, 0, cap + PCI_CAP_LIST_NEXT);
+ }
+ if (!device_cap || !console_access_cap) {
+ printk(KERN_ERR "lguest: No caps (%u/%u/%u) in console!\n",
+ common_cap, device_cap, console_access_cap);
+ return;
+ }
+
+ /*
+ * Note that we can't check features, until we've set the DRIVER
+ * status bit. We don't want to do that until we have a real driver,
+ * so we just check that the device-specific config has room for
+ * emerg_wr. If it doesn't support VIRTIO_CONSOLE_F_EMERG_WRITE
+ * it should ignore the access.
+ */
+ if (device_len < (offsetof(struct virtio_console_config, emerg_wr)
+ + sizeof(u32))) {
+ printk(KERN_ERR "lguest: console missing emerg_wr field\n");
+ return;
+ }
+
+ console_cfg_offset = device_offset;
+ printk(KERN_INFO "lguest: Console via virtio-pci emerg_wr\n");
+}
+
/*
* We will eventually use the virtio console device to produce console output,
- * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
- * console output.
+ * but before that is set up we use the virtio PCI console's backdoor mmio
+ * access and the "emergency" write facility (which is legal even before the
+ * device is configured).
*/
static __init int early_put_chars(u32 vtermno, const char *buf, int count)
{
- char scratch[17];
- unsigned int len = count;
+ /* If we couldn't find PCI console, forget it. */
+ if (console_cfg_offset < 0)
+ return count;
- /* We use a nul-terminated string, so we make a copy. Icky, huh? */
- if (len > sizeof(scratch) - 1)
- len = sizeof(scratch) - 1;
- scratch[len] = '\0';
- memcpy(scratch, buf, len);
- hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
+ if (unlikely(!console_cfg_offset)) {
+ probe_pci_console();
+ if (console_cfg_offset < 0)
+ return count;
+ }
- /* This routine returns the number of bytes actually written. */
- return len;
+ write_bar_via_cfg(console_access_cap,
+ console_cfg_offset
+ + offsetof(struct virtio_console_config, emerg_wr),
+ buf[0]);
+ return 1;
}
/*
@@ -1400,14 +1534,6 @@ __init void lguest_init(void)
atomic_notifier_chain_register(&panic_notifier_list, &paniced);
/*
- * The IDE code spends about 3 seconds probing for disks: if we reserve
- * all the I/O ports up front it can't get them and so doesn't probe.
- * Other device drivers are similar (but less severe). This cuts the
- * kernel boot time on my machine from 4.1 seconds to 0.45 seconds.
- */
- paravirt_disable_iospace();
-
- /*
* This is messy CPU setup stuff which the native boot code does before
* start_kernel, so we have to do, too:
*/
@@ -1436,6 +1562,13 @@ __init void lguest_init(void)
/* Register our very early console. */
virtio_cons_early_init(early_put_chars);
+ /* Don't let ACPI try to control our PCI interrupts. */
+ disable_acpi();
+
+ /* We control them ourselves, by overriding these two hooks. */
+ pcibios_enable_irq = lguest_enable_irq;
+ pcibios_disable_irq = lguest_disable_irq;
+
/*
* Last of all, we set the power management poweroff hook to point to
* the Guest routine to power off, and the reboot hook to our restart
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 2480978b31cc..1313ae6b478b 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -28,7 +28,7 @@
/* Verify next sizeof(t) bytes can be on the same instruction */
#define validate_next(t, insn, n) \
- ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr)
+ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr)
#define __get_next(t, insn) \
({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 56313a326188..89b53c9968e7 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -53,6 +53,8 @@
.Lmemcpy_e_e:
.previous
+.weak memcpy
+
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
@@ -199,8 +201,8 @@ ENDPROC(__memcpy)
* only outcome...
*/
.section .altinstructions, "a"
- altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+ altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
- altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+ altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 65268a6104f4..9c4b530575da 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,7 +24,10 @@
* Output:
* rax: dest
*/
+.weak memmove
+
ENTRY(memmove)
+ENTRY(__memmove)
CFI_STARTPROC
/* Handle more 32 bytes in loop */
@@ -220,4 +223,5 @@ ENTRY(memmove)
.Lmemmove_end_forward-.Lmemmove_begin_forward, \
.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
+ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2dcb3808cbda..6f44935c6a60 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -56,6 +56,8 @@
.Lmemset_e_e:
.previous
+.weak memset
+
ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
@@ -147,8 +149,8 @@ ENDPROC(__memset)
* feature to implement the right patch order.
*/
.section .altinstructions,"a"
- altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
- .Lfinal-memset,.Lmemset_e-.Lmemset_c
- altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
- .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
+ altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+ .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+ altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+ .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
.previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index ecfdc46a024a..c4cc74006c61 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck/
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
+
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 38dcec403b46..ede025fb46f1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
if (pte && pte_present(*pte) && pte_exec(*pte) &&
(pgd_flags(*pgd) & _PAGE_USER) &&
- (read_cr4() & X86_CR4_SMEP))
+ (__read_cr4() & X86_CR4_SMEP))
printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
}
@@ -898,6 +898,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
+ else if (fault & VM_FAULT_SIGSEGV)
+ bad_area_nosemaphore(regs, error_code, address);
else
BUG();
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d7547824e763..81bf3d2af3eb 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
struct page *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
- if (pte_numa(pte)) {
+ if (pte_protnone(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -172,13 +172,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
*/
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
- if (unlikely(pmd_large(pmd))) {
+ if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
@@ -388,10 +388,9 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ (end - start) >> PAGE_SHIFT,
+ write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8b977ebf9388..42982b26e32b 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -52,23 +52,17 @@ int pud_huge(pud_t pud)
return 0;
}
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- return NULL;
-}
#else
-struct page *
-follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
-{
- return ERR_PTR(-EINVAL);
-}
-
+/*
+ * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal
+ * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry.
+ * Otherwise, returns 0.
+ */
int pmd_huge(pmd_t pmd)
{
- return !!(pmd_val(pmd) & _PAGE_PSE);
+ return !pmd_none(pmd) &&
+ (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT;
}
int pud_huge(pud_t pud)
@@ -178,4 +172,15 @@ static __init int setup_hugepagesz(char *opt)
return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
+
+#ifdef CONFIG_CMA
+static __init int gigantic_pages_init(void)
+{
+ /* With CMA we can allocate gigantic pages at runtime */
+ if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ return 0;
+}
+arch_initcall(gigantic_pages_init);
+#endif
#endif
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a97ee0801475..a110efca6d06 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -43,7 +43,7 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WT] = _PAGE_PCD,
[_PAGE_CACHE_MODE_WP] = _PAGE_PCD,
};
-EXPORT_SYMBOL_GPL(__cachemode2pte_tbl);
+EXPORT_SYMBOL(__cachemode2pte_tbl);
uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
@@ -54,7 +54,7 @@ uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
-EXPORT_SYMBOL_GPL(__pte2cachemode_tbl);
+EXPORT_SYMBOL(__pte2cachemode_tbl);
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
@@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void)
/* Enable PSE if available */
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
@@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
}
}
+static const char *page_size_string(struct map_range *mr)
+{
+ static const char str_1g[] = "1G";
+ static const char str_2m[] = "2M";
+ static const char str_4m[] = "4M";
+ static const char str_4k[] = "4k";
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_1G))
+ return str_1g;
+ /*
+ * 32-bit without PAE has a 4M large page size.
+ * PG_LEVEL_2M is misnamed, but we can at least
+ * print out the right size in the string.
+ */
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ !IS_ENABLED(CONFIG_X86_PAE) &&
+ mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_4m;
+
+ if (mr->page_size_mask & (1<<PG_LEVEL_2M))
+ return str_2m;
+
+ return str_4k;
+}
+
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
unsigned long start,
unsigned long end)
@@ -333,8 +358,7 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
- (mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
- (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+ page_size_string(&mr[i]));
return nr_range;
}
@@ -438,20 +462,20 @@ static unsigned long __init init_range_memory_mapping(
static unsigned long __init get_new_step_size(unsigned long step_size)
{
/*
- * Explain why we shift by 5 and why we don't have to worry about
- * 'step_size << 5' overflowing:
- *
- * initial mapped size is PMD_SIZE (2M).
+ * Initial mapped size is PMD_SIZE (2M).
* We can not set step_size to be PUD_SIZE (1G) yet.
* In worse case, when we cross the 1G boundary, and
* PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
- * to map 1G range with PTE. Use 5 as shift for now.
+ * to map 1G range with PTE. Hence we use one less than the
+ * difference of page table level shifts.
*
- * Don't need to worry about overflow, on 32bit, when step_size
- * is 0, round_down() returns 0 for start, and that turns it
- * into 0x100000000ULL.
+ * Don't need to worry about overflow in the top-down case, on 32bit,
+ * when step_size is 0, round_down() returns 0 for start, and that
+ * turns it into 0x100000000ULL.
+ * In the bottom-up case, round_up(x, 0) returns 0 though too, which
+ * needs to be taken into consideration by the code below.
*/
- return step_size << 5;
+ return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
}
/**
@@ -471,7 +495,6 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
- unsigned long new_mapped_ram_size;
/* xen has big range in reserved near end of ram, skip it at first.*/
addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
@@ -496,14 +519,12 @@ static void __init memory_map_top_down(unsigned long map_start,
start = map_start;
} else
start = map_start;
- new_mapped_ram_size = init_range_memory_mapping(start,
+ mapped_ram_size += init_range_memory_mapping(start,
last_start);
last_start = start;
min_pfn_mapped = last_start >> PAGE_SHIFT;
- /* only increase step_size after big range get mapped */
- if (new_mapped_ram_size > mapped_ram_size)
+ if (mapped_ram_size >= step_size)
step_size = get_new_step_size(step_size);
- mapped_ram_size += new_mapped_ram_size;
}
if (real_end < map_end)
@@ -524,7 +545,7 @@ static void __init memory_map_top_down(unsigned long map_start,
static void __init memory_map_bottom_up(unsigned long map_start,
unsigned long map_end)
{
- unsigned long next, new_mapped_ram_size, start;
+ unsigned long next, start;
unsigned long mapped_ram_size = 0;
/* step_size need to be small so pgt_buf from BRK could cover it */
unsigned long step_size = PMD_SIZE;
@@ -539,19 +560,19 @@ static void __init memory_map_bottom_up(unsigned long map_start,
* for page table.
*/
while (start < map_end) {
- if (map_end - start > step_size) {
+ if (step_size && map_end - start > step_size) {
next = round_up(start + 1, step_size);
if (next > map_end)
next = map_end;
- } else
+ } else {
next = map_end;
+ }
- new_mapped_ram_size = init_range_memory_mapping(start, next);
+ mapped_ram_size += init_range_memory_mapping(start, next);
start = next;
- if (new_mapped_ram_size > mapped_ram_size)
+ if (mapped_ram_size >= step_size)
step_size = get_new_step_size(step_size);
- mapped_ram_size += new_mapped_ram_size;
}
}
@@ -611,7 +632,7 @@ void __init init_mem_mapping(void)
*
*
* On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
+ * contains BIOS code and data regions used by X and dosemu and similar apps.
* Access has to be given to non-kernel-ram areas as well, these contain the PCI
* mmio resources as well as potential bios/acpi data regions.
*/
@@ -716,6 +737,15 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
}
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+#ifdef CONFIG_SMP
+ .active_mm = &init_mm,
+ .state = 0,
+#endif
+ .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
+};
+EXPORT_SYMBOL_GPL(cpu_tlbstate);
+
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
/* entry 0 MUST be WB (hardwired to speed up translations) */
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
new file mode 100644
index 000000000000..4860906c6b9f
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,206 @@
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+extern unsigned char kasan_zero_page[PAGE_SIZE];
+
+static int __init map_range(struct range *range)
+{
+ unsigned long start;
+ unsigned long end;
+
+ start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+ end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+ /*
+ * end + 1 here is intentional. We check several shadow bytes in advance
+ * to slightly speed up fastpath. In some rare cases we could cross
+ * boundary of mapped shadow, so we just map some more here.
+ */
+ return vmemmap_populate(start, end + 1, NUMA_NO_NODE);
+}
+
+static void __init clear_pgds(unsigned long start,
+ unsigned long end)
+{
+ for (; start < end; start += PGDIR_SIZE)
+ pgd_clear(pgd_offset_k(start));
+}
+
+void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+ int i;
+ unsigned long start = KASAN_SHADOW_START;
+ unsigned long end = KASAN_SHADOW_END;
+
+ for (i = pgd_index(start); start < end; i++) {
+ pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+ | _KERNPG_TABLE);
+ start += PGDIR_SIZE;
+ }
+}
+
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ while (addr + PAGE_SIZE <= end) {
+ WARN_ON(!pte_none(*pte));
+ set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+ | __PAGE_KERNEL_RO));
+ addr += PAGE_SIZE;
+ pte = pte_offset_kernel(pmd, addr);
+ }
+ return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ int ret = 0;
+ pmd_t *pmd = pmd_offset(pud, addr);
+
+ while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+ WARN_ON(!pmd_none(*pmd));
+ set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+ | __PAGE_KERNEL_RO));
+ addr += PMD_SIZE;
+ pmd = pmd_offset(pud, addr);
+ }
+ if (addr < end) {
+ if (pmd_none(*pmd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pte_populate(pmd, addr, end);
+ }
+ return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+ unsigned long end)
+{
+ int ret = 0;
+ pud_t *pud = pud_offset(pgd, addr);
+
+ while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+ WARN_ON(!pud_none(*pud));
+ set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+ | __PAGE_KERNEL_RO));
+ addr += PUD_SIZE;
+ pud = pud_offset(pgd, addr);
+ }
+
+ if (addr < end) {
+ if (pud_none(*pud)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pmd_populate(pud, addr, end);
+ }
+ return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+ int ret = 0;
+ pgd_t *pgd = pgd_offset_k(addr);
+
+ while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+ WARN_ON(!pgd_none(*pgd));
+ set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+ | __PAGE_KERNEL_RO));
+ addr += PGDIR_SIZE;
+ pgd = pgd_offset_k(addr);
+ }
+
+ if (addr < end) {
+ if (pgd_none(*pgd)) {
+ void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
+ if (!p)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+ }
+ ret = zero_pud_populate(pgd, addr, end);
+ }
+ return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+ if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+ panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+ unsigned long val,
+ void *data)
+{
+ if (val == DIE_GPF) {
+ pr_emerg("CONFIG_KASAN_INLINE enabled");
+ pr_emerg("GPF could be caused by NULL-ptr deref or user memory access");
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+ .notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_init(void)
+{
+ int i;
+
+#ifdef CONFIG_KASAN_INLINE
+ register_die_notifier(&kasan_die_notifier);
+#endif
+
+ memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+ load_cr3(early_level4_pgt);
+
+ clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+ populate_zero_shadow((void *)KASAN_SHADOW_START,
+ kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+ for (i = 0; i < E820_X_MAX; i++) {
+ if (pfn_mapped[i].end == 0)
+ break;
+
+ if (map_range(&pfn_mapped[i]))
+ panic("kasan: unable to allocate shadow!");
+ }
+ populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ NUMA_NO_NODE);
+
+ populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+ (void *)KASAN_SHADOW_END);
+
+ memset(kasan_zero_page, 0, PAGE_SIZE);
+
+ load_cr3(init_level4_pgt);
+ init_task.kasan_depth = 0;
+}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 919b91205cd4..df4552bd239e 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = {
.flags = -1,
};
-static unsigned int stack_maxrandom_size(void)
+static unsigned long stack_maxrandom_size(void)
{
- unsigned int max = 0;
+ unsigned long max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
+ max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
}
return max;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 67ebf5751222..c439ec478216 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -349,6 +349,12 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk)
return MPX_INVALID_BOUNDS_DIR;
/*
+ * 32-bit binaries on 64-bit kernels are currently
+ * unsupported.
+ */
+ if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32))
+ return MPX_INVALID_BOUNDS_DIR;
+ /*
* The bounds directory pointer is stored in a register
* only accessible if we first do an xsave.
*/
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1a883705a12a..cd4785bbacb9 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -794,7 +794,6 @@ int early_cpu_to_node(int cpu)
void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
struct cpumask *mask;
- char buf[64];
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
@@ -812,10 +811,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable)
else
cpumask_clear_cpu(cpu, mask);
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
- cpu, node, buf);
+ cpu, node, cpumask_pr_args(mask));
return;
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index edf299c8ff6c..7ac68698406c 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -234,8 +234,13 @@ void pat_init(void)
PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
/* Boot CPU check */
- if (!boot_pat_state)
+ if (!boot_pat_state) {
rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+ if (!boot_pat_state) {
+ pat_disable("PAT read returns always zero, disabled.");
+ return;
+ }
+ }
wrmsrl(MSR_IA32_CR_PAT, pat);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..7b22adaad4f1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
#endif /* CONFIG_X86_PAE */
-static void free_pmds(pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
int i;
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[])
if (pmds[i]) {
pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
free_page((unsigned long)pmds[i]);
+ mm_dec_nr_pmds(mm);
}
}
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
{
int i;
bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
pmd = NULL;
failed = true;
}
+ if (pmd)
+ mm_inc_nr_pmds(mm);
pmds[i] = pmd;
}
if (failed) {
- free_pmds(pmds);
+ free_pmds(mm, pmds);
return -ENOMEM;
}
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
+ mm_dec_nr_pmds(mm);
}
}
}
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
mm->pgd = pgd;
- if (preallocate_pmds(pmds) != 0)
+ if (preallocate_pmds(mm, pmds) != 0)
goto out_free_pgd;
if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
return pgd;
out_free_pmds:
- free_pmds(pmds);
+ free_pmds(mm, pmds);
out_free_pgd:
free_page((unsigned long)pgd);
out:
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ee61c36d64f8..3250f2371aea 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,9 +14,6 @@
#include <asm/uv/uv.h>
#include <linux/debugfs.h>
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
- = { &init_mm, 0, };
-
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index cfd1b132b8e3..6ac273832f28 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -10,9 +10,6 @@
struct pci_root_info {
struct acpi_device *bridge;
char name[16];
- unsigned int res_num;
- struct resource *res;
- resource_size_t *res_offset;
struct pci_sysdata sd;
#ifdef CONFIG_PCI_MMCONFIG
bool mcfg_added;
@@ -218,130 +215,41 @@ static void teardown_mcfg_map(struct pci_root_info *info)
}
#endif
-static acpi_status resource_to_addr(struct acpi_resource *resource,
- struct acpi_resource_address64 *addr)
-{
- acpi_status status;
- struct acpi_resource_memory24 *memory24;
- struct acpi_resource_memory32 *memory32;
- struct acpi_resource_fixed_memory32 *fixed_memory32;
-
- memset(addr, 0, sizeof(*addr));
- switch (resource->type) {
- case ACPI_RESOURCE_TYPE_MEMORY24:
- memory24 = &resource->data.memory24;
- addr->resource_type = ACPI_MEMORY_RANGE;
- addr->minimum = memory24->minimum;
- addr->address_length = memory24->address_length;
- addr->maximum = addr->minimum + addr->address_length - 1;
- return AE_OK;
- case ACPI_RESOURCE_TYPE_MEMORY32:
- memory32 = &resource->data.memory32;
- addr->resource_type = ACPI_MEMORY_RANGE;
- addr->minimum = memory32->minimum;
- addr->address_length = memory32->address_length;
- addr->maximum = addr->minimum + addr->address_length - 1;
- return AE_OK;
- case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
- fixed_memory32 = &resource->data.fixed_memory32;
- addr->resource_type = ACPI_MEMORY_RANGE;
- addr->minimum = fixed_memory32->address;
- addr->address_length = fixed_memory32->address_length;
- addr->maximum = addr->minimum + addr->address_length - 1;
- return AE_OK;
- case ACPI_RESOURCE_TYPE_ADDRESS16:
- case ACPI_RESOURCE_TYPE_ADDRESS32:
- case ACPI_RESOURCE_TYPE_ADDRESS64:
- status = acpi_resource_to_address64(resource, addr);
- if (ACPI_SUCCESS(status) &&
- (addr->resource_type == ACPI_MEMORY_RANGE ||
- addr->resource_type == ACPI_IO_RANGE) &&
- addr->address_length > 0) {
- return AE_OK;
- }
- break;
- }
- return AE_ERROR;
-}
-
-static acpi_status count_resource(struct acpi_resource *acpi_res, void *data)
+static void validate_resources(struct device *dev, struct list_head *crs_res,
+ unsigned long type)
{
- struct pci_root_info *info = data;
- struct acpi_resource_address64 addr;
- acpi_status status;
-
- status = resource_to_addr(acpi_res, &addr);
- if (ACPI_SUCCESS(status))
- info->res_num++;
- return AE_OK;
-}
-
-static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data)
-{
- struct pci_root_info *info = data;
- struct resource *res;
- struct acpi_resource_address64 addr;
- acpi_status status;
- unsigned long flags;
- u64 start, orig_end, end;
-
- status = resource_to_addr(acpi_res, &addr);
- if (!ACPI_SUCCESS(status))
- return AE_OK;
-
- if (addr.resource_type == ACPI_MEMORY_RANGE) {
- flags = IORESOURCE_MEM;
- if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
- flags |= IORESOURCE_PREFETCH;
- } else if (addr.resource_type == ACPI_IO_RANGE) {
- flags = IORESOURCE_IO;
- } else
- return AE_OK;
-
- start = addr.minimum + addr.translation_offset;
- orig_end = end = addr.maximum + addr.translation_offset;
-
- /* Exclude non-addressable range or non-addressable portion of range */
- end = min(end, (u64)iomem_resource.end);
- if (end <= start) {
- dev_info(&info->bridge->dev,
- "host bridge window [%#llx-%#llx] "
- "(ignored, not CPU addressable)\n", start, orig_end);
- return AE_OK;
- } else if (orig_end != end) {
- dev_info(&info->bridge->dev,
- "host bridge window [%#llx-%#llx] "
- "([%#llx-%#llx] ignored, not CPU addressable)\n",
- start, orig_end, end + 1, orig_end);
- }
+ LIST_HEAD(list);
+ struct resource *res1, *res2, *root = NULL;
+ struct resource_entry *tmp, *entry, *entry2;
- res = &info->res[info->res_num];
- res->name = info->name;
- res->flags = flags;
- res->start = start;
- res->end = end;
- info->res_offset[info->res_num] = addr.translation_offset;
- info->res_num++;
+ BUG_ON((type & (IORESOURCE_MEM | IORESOURCE_IO)) == 0);
+ root = (type & IORESOURCE_MEM) ? &iomem_resource : &ioport_resource;
- if (!pci_use_crs)
- dev_printk(KERN_DEBUG, &info->bridge->dev,
- "host bridge window %pR (ignored)\n", res);
+ list_splice_init(crs_res, &list);
+ resource_list_for_each_entry_safe(entry, tmp, &list) {
+ bool free = false;
+ resource_size_t end;
- return AE_OK;
-}
-
-static void coalesce_windows(struct pci_root_info *info, unsigned long type)
-{
- int i, j;
- struct resource *res1, *res2;
-
- for (i = 0; i < info->res_num; i++) {
- res1 = &info->res[i];
+ res1 = entry->res;
if (!(res1->flags & type))
- continue;
+ goto next;
+
+ /* Exclude non-addressable range or non-addressable portion */
+ end = min(res1->end, root->end);
+ if (end <= res1->start) {
+ dev_info(dev, "host bridge window %pR (ignored, not CPU addressable)\n",
+ res1);
+ free = true;
+ goto next;
+ } else if (res1->end != end) {
+ dev_info(dev, "host bridge window %pR ([%#llx-%#llx] ignored, not CPU addressable)\n",
+ res1, (unsigned long long)end + 1,
+ (unsigned long long)res1->end);
+ res1->end = end;
+ }
- for (j = i + 1; j < info->res_num; j++) {
- res2 = &info->res[j];
+ resource_list_for_each_entry(entry2, crs_res) {
+ res2 = entry2->res;
if (!(res2->flags & type))
continue;
@@ -353,118 +261,92 @@ static void coalesce_windows(struct pci_root_info *info, unsigned long type)
if (resource_overlaps(res1, res2)) {
res2->start = min(res1->start, res2->start);
res2->end = max(res1->end, res2->end);
- dev_info(&info->bridge->dev,
- "host bridge window expanded to %pR; %pR ignored\n",
+ dev_info(dev, "host bridge window expanded to %pR; %pR ignored\n",
res2, res1);
- res1->flags = 0;
+ free = true;
+ goto next;
}
}
+
+next:
+ resource_list_del(entry);
+ if (free)
+ resource_list_free_entry(entry);
+ else
+ resource_list_add_tail(entry, crs_res);
}
}
static void add_resources(struct pci_root_info *info,
- struct list_head *resources)
+ struct list_head *resources,
+ struct list_head *crs_res)
{
- int i;
- struct resource *res, *root, *conflict;
-
- coalesce_windows(info, IORESOURCE_MEM);
- coalesce_windows(info, IORESOURCE_IO);
+ struct resource_entry *entry, *tmp;
+ struct resource *res, *conflict, *root = NULL;
- for (i = 0; i < info->res_num; i++) {
- res = &info->res[i];
+ validate_resources(&info->bridge->dev, crs_res, IORESOURCE_MEM);
+ validate_resources(&info->bridge->dev, crs_res, IORESOURCE_IO);
+ resource_list_for_each_entry_safe(entry, tmp, crs_res) {
+ res = entry->res;
if (res->flags & IORESOURCE_MEM)
root = &iomem_resource;
else if (res->flags & IORESOURCE_IO)
root = &ioport_resource;
else
- continue;
+ BUG_ON(res);
conflict = insert_resource_conflict(root, res);
- if (conflict)
+ if (conflict) {
dev_info(&info->bridge->dev,
"ignoring host bridge window %pR (conflicts with %s %pR)\n",
res, conflict->name, conflict);
- else
- pci_add_resource_offset(resources, res,
- info->res_offset[i]);
+ resource_list_destroy_entry(entry);
+ }
}
-}
-static void free_pci_root_info_res(struct pci_root_info *info)
-{
- kfree(info->res);
- info->res = NULL;
- kfree(info->res_offset);
- info->res_offset = NULL;
- info->res_num = 0;
+ list_splice_tail(crs_res, resources);
}
-static void __release_pci_root_info(struct pci_root_info *info)
+static void release_pci_root_info(struct pci_host_bridge *bridge)
{
- int i;
struct resource *res;
+ struct resource_entry *entry;
+ struct pci_root_info *info = bridge->release_data;
- for (i = 0; i < info->res_num; i++) {
- res = &info->res[i];
-
- if (!res->parent)
- continue;
-
- if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
- continue;
-
- release_resource(res);
+ resource_list_for_each_entry(entry, &bridge->windows) {
+ res = entry->res;
+ if (res->parent &&
+ (res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
+ release_resource(res);
}
- free_pci_root_info_res(info);
-
teardown_mcfg_map(info);
-
kfree(info);
}
-static void release_pci_root_info(struct pci_host_bridge *bridge)
-{
- struct pci_root_info *info = bridge->release_data;
-
- __release_pci_root_info(info);
-}
-
static void probe_pci_root_info(struct pci_root_info *info,
struct acpi_device *device,
- int busnum, int domain)
+ int busnum, int domain,
+ struct list_head *list)
{
- size_t size;
+ int ret;
+ struct resource_entry *entry;
sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
info->bridge = device;
-
- info->res_num = 0;
- acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
- info);
- if (!info->res_num)
- return;
-
- size = sizeof(*info->res) * info->res_num;
- info->res = kzalloc_node(size, GFP_KERNEL, info->sd.node);
- if (!info->res) {
- info->res_num = 0;
- return;
- }
-
- size = sizeof(*info->res_offset) * info->res_num;
- info->res_num = 0;
- info->res_offset = kzalloc_node(size, GFP_KERNEL, info->sd.node);
- if (!info->res_offset) {
- kfree(info->res);
- info->res = NULL;
- return;
- }
-
- acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
- info);
+ ret = acpi_dev_get_resources(device, list,
+ acpi_dev_filter_resource_type_cb,
+ (void *)(IORESOURCE_IO | IORESOURCE_MEM));
+ if (ret < 0)
+ dev_warn(&device->dev,
+ "failed to parse _CRS method, error code %d\n", ret);
+ else if (ret == 0)
+ dev_dbg(&device->dev,
+ "no IO and memory resources present in _CRS\n");
+ else
+ resource_list_for_each_entry(entry, list)
+ entry->res->name = info->name;
}
struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
@@ -473,6 +355,8 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
struct pci_root_info *info;
int domain = root->segment;
int busnum = root->secondary.start;
+ struct resource_entry *res_entry;
+ LIST_HEAD(crs_res);
LIST_HEAD(resources);
struct pci_bus *bus;
struct pci_sysdata *sd;
@@ -520,18 +404,22 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
memcpy(bus->sysdata, sd, sizeof(*sd));
kfree(info);
} else {
- probe_pci_root_info(info, device, busnum, domain);
-
/* insert busn res at first */
pci_add_resource(&resources, &root->secondary);
+
/*
* _CRS with no apertures is normal, so only fall back to
* defaults or native bridge info if we're ignoring _CRS.
*/
- if (pci_use_crs)
- add_resources(info, &resources);
- else {
- free_pci_root_info_res(info);
+ probe_pci_root_info(info, device, busnum, domain, &crs_res);
+ if (pci_use_crs) {
+ add_resources(info, &resources, &crs_res);
+ } else {
+ resource_list_for_each_entry(res_entry, &crs_res)
+ dev_printk(KERN_DEBUG, &device->dev,
+ "host bridge window %pR (ignored)\n",
+ res_entry->res);
+ resource_list_free(&crs_res);
x86_pci_root_bus_resources(busnum, &resources);
}
@@ -546,8 +434,9 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
to_pci_host_bridge(bus->bridge),
release_pci_root_info, info);
} else {
- pci_free_resource_list(&resources);
- __release_pci_root_info(info);
+ resource_list_free(&resources);
+ teardown_mcfg_map(info);
+ kfree(info);
}
}
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
index f3a2cfc14125..7bcf06a7cd12 100644
--- a/arch/x86/pci/bus_numa.c
+++ b/arch/x86/pci/bus_numa.c
@@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
{
struct pci_root_info *info = x86_find_pci_root_info(bus);
struct pci_root_res *root_res;
- struct pci_host_bridge_window *window;
+ struct resource_entry *window;
bool found = false;
if (!info)
@@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources)
bus);
/* already added by acpi ? */
- list_for_each_entry(window, resources, list)
+ resource_list_for_each_entry(window, resources)
if (window->res->flags & IORESOURCE_BUS) {
found = true;
break;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7b20bccf3648..3d2612b68694 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -448,6 +448,22 @@ static const struct dmi_system_id pciprobe_dmi_table[] __initconst = {
DMI_MATCH(DMI_PRODUCT_NAME, "ftServer"),
},
},
+ {
+ .callback = set_scan_all,
+ .ident = "Stratus/NEC ftServer",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Express5800/R32"),
+ },
+ },
+ {
+ .callback = set_scan_all,
+ .ident = "Stratus/NEC ftServer",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "NEC"),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Express5800/R31"),
+ },
+ },
{}
};
@@ -497,6 +513,31 @@ void __init pcibios_set_cache_line_size(void)
}
}
+/*
+ * Some device drivers assume dev->irq won't change after calling
+ * pci_disable_device(). So delay releasing of IRQ resource to driver
+ * unbinding time. Otherwise it will break PM subsystem and drivers
+ * like xen-pciback etc.
+ */
+static int pci_irq_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct pci_dev *dev = to_pci_dev(data);
+
+ if (action != BUS_NOTIFY_UNBOUND_DRIVER)
+ return NOTIFY_DONE;
+
+ if (pcibios_disable_irq)
+ pcibios_disable_irq(dev);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block pci_irq_nb = {
+ .notifier_call = pci_irq_notifier,
+ .priority = INT_MIN,
+};
+
int __init pcibios_init(void)
{
if (!raw_pci_ops) {
@@ -509,6 +550,9 @@ int __init pcibios_init(void)
if (pci_bf_sort >= pci_force_bf)
pci_sort_breadthfirst();
+
+ bus_register_notifier(&pci_bus_type, &pci_irq_nb);
+
return 0;
}
@@ -667,12 +711,6 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
return 0;
}
-void pcibios_disable_device (struct pci_dev *dev)
-{
- if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
- pcibios_disable_irq(dev);
-}
-
int pci_ext_cfg_avail(void)
{
if (raw_pci_ext_ops)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 9b18ef315a55..349c0d32cc0b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -216,7 +216,7 @@ static void pcibios_allocate_bridge_resources(struct pci_dev *dev)
continue;
if (r->parent) /* Already allocated */
continue;
- if (!r->start || pci_claim_resource(dev, idx) < 0) {
+ if (!r->start || pci_claim_bridge_resource(dev, idx) < 0) {
/*
* Something is wrong with the region.
* Invalidate the resource to prevent
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 44b9271580b5..efb849323c74 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
static void intel_mid_pci_irq_disable(struct pci_dev *dev)
{
- if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
- dev->irq > 0) {
+ if (dev->irq_managed && dev->irq > 0) {
mp_unmap_irq(dev->irq);
dev->irq_managed = 0;
+ dev->irq = 0;
}
}
@@ -293,7 +293,6 @@ static void mrst_power_off_unused_dev(struct pci_dev *dev)
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev);
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev);
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev);
/*
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5dc6ca5e1741..e71b3dbd87b8 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1256,22 +1256,9 @@ static int pirq_enable_irq(struct pci_dev *dev)
return 0;
}
-bool mp_should_keep_irq(struct device *dev)
-{
- if (dev->power.is_prepared)
- return true;
-#ifdef CONFIG_PM
- if (dev->power.runtime_status == RPM_SUSPENDING)
- return true;
-#endif
-
- return false;
-}
-
static void pirq_disable_irq(struct pci_dev *dev)
{
- if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
- dev->irq_managed && dev->irq) {
+ if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) {
mp_unmap_irq(dev->irq);
dev->irq = 0;
dev->irq_managed = 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 326198a4434e..dd30b7e08bc2 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -397,12 +397,12 @@ static acpi_status check_mcfg_resource(struct acpi_resource *res, void *data)
status = acpi_resource_to_address64(res, &address);
if (ACPI_FAILURE(status) ||
- (address.address_length <= 0) ||
+ (address.address.address_length <= 0) ||
(address.resource_type != ACPI_MEMORY_RANGE))
return AE_OK;
- if ((mcfg_res->start >= address.minimum) &&
- (mcfg_res->end < (address.minimum + address.address_length))) {
+ if ((mcfg_res->start >= address.address.minimum) &&
+ (mcfg_res->end < (address.address.minimum + address.address.address_length))) {
mcfg_res->flags = 1;
return AE_CTRL_TERMINATE;
}
@@ -610,6 +610,32 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
return 0;
}
+#ifdef CONFIG_ACPI_APEI
+extern int (*arch_apei_filter_addr)(int (*func)(__u64 start, __u64 size,
+ void *data), void *data);
+
+static int pci_mmcfg_for_each_region(int (*func)(__u64 start, __u64 size,
+ void *data), void *data)
+{
+ struct pci_mmcfg_region *cfg;
+ int rc;
+
+ if (list_empty(&pci_mmcfg_list))
+ return 0;
+
+ list_for_each_entry(cfg, &pci_mmcfg_list, list) {
+ rc = func(cfg->res.start, resource_size(&cfg->res), data);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+#define set_apei_filter() (arch_apei_filter_addr = pci_mmcfg_for_each_region)
+#else
+#define set_apei_filter()
+#endif
+
static void __init __pci_mmcfg_init(int early)
{
pci_mmcfg_reject_broken(early);
@@ -644,6 +670,8 @@ void __init pci_mmcfg_early_init(void)
else
acpi_sfi_table_parse(ACPI_SIG_MCFG, pci_parse_mcfg);
__pci_mmcfg_init(1);
+
+ set_apei_filter();
}
}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index c489ef2c1a39..d22f4b5bbc04 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -298,12 +298,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
map_irq.entry_nr = nvec;
} else if (type == PCI_CAP_ID_MSIX) {
int pos;
+ unsigned long flags;
u32 table_offset, bir;
pos = dev->msix_cap;
pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
&table_offset);
bir = (u8)(table_offset & PCI_MSIX_TABLE_BIR);
+ flags = pci_resource_flags(dev, bir);
+ if (!flags || (flags & IORESOURCE_UNSET))
+ return -EINVAL;
map_irq.table_base = pci_resource_start(dev, bir);
map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
@@ -458,6 +462,7 @@ int __init pci_xen_hvm_init(void)
* just how GSIs get registered.
*/
__acpi_register_gsi = acpi_register_gsi_xen_hvm;
+ __acpi_unregister_gsi = NULL;
#endif
#ifdef CONFIG_PCI_MSI
@@ -471,52 +476,6 @@ int __init pci_xen_hvm_init(void)
}
#ifdef CONFIG_XEN_DOM0
-static __init void xen_setup_acpi_sci(void)
-{
- int rc;
- int trigger, polarity;
- int gsi = acpi_sci_override_gsi;
- int irq = -1;
- int gsi_override = -1;
-
- if (!gsi)
- return;
-
- rc = acpi_get_override_irq(gsi, &trigger, &polarity);
- if (rc) {
- printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi"
- " sci, rc=%d\n", rc);
- return;
- }
- trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
- polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
-
- printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
- "polarity=%d\n", gsi, trigger, polarity);
-
- /* Before we bind the GSI to a Linux IRQ, check whether
- * we need to override it with bus_irq (IRQ) value. Usually for
- * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so:
- * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
- * but there are oddballs where the IRQ != GSI:
- * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)
- * which ends up being: gsi_to_irq[9] == 20
- * (which is what acpi_gsi_to_irq ends up calling when starting the
- * the ACPI interpreter and keels over since IRQ 9 has not been
- * setup as we had setup IRQ 20 for it).
- */
- if (acpi_gsi_to_irq(gsi, &irq) == 0) {
- /* Use the provided value if it's valid. */
- if (irq >= 0)
- gsi_override = irq;
- }
-
- gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity);
- printk(KERN_INFO "xen: acpi sci %d\n", gsi);
-
- return;
-}
-
int __init pci_xen_initial_domain(void)
{
int irq;
@@ -527,8 +486,8 @@ int __init pci_xen_initial_domain(void)
x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs;
pci_msi_ignore_mask = 1;
#endif
- xen_setup_acpi_sci();
__acpi_register_gsi = acpi_register_gsi_xen;
+ __acpi_unregister_gsi = NULL;
/* Pre-allocate legacy irqs */
for (irq = 0; irq < nr_legacy_irqs(); irq++) {
int trigger, polarity;
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 85afde1fa3e5..a62e0be3a2f1 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -5,6 +5,7 @@ obj-y += geode/
obj-y += goldfish/
obj-y += iris/
obj-y += intel-mid/
+obj-y += intel-quark/
obj-y += olpc/
obj-y += scx200/
obj-y += sfi/
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 5fcda7272550..86d0f9e08dd9 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -91,167 +91,6 @@ ENTRY(efi_call)
ret
ENDPROC(efi_call)
-#ifdef CONFIG_EFI_MIXED
-
-/*
- * We run this function from the 1:1 mapping.
- *
- * This function must be invoked with a 1:1 mapped stack.
- */
-ENTRY(__efi64_thunk)
- movl %ds, %eax
- push %rax
- movl %es, %eax
- push %rax
- movl %ss, %eax
- push %rax
-
- subq $32, %rsp
- movl %esi, 0x0(%rsp)
- movl %edx, 0x4(%rsp)
- movl %ecx, 0x8(%rsp)
- movq %r8, %rsi
- movl %esi, 0xc(%rsp)
- movq %r9, %rsi
- movl %esi, 0x10(%rsp)
-
- sgdt save_gdt(%rip)
-
- leaq 1f(%rip), %rbx
- movq %rbx, func_rt_ptr(%rip)
-
- /* Switch to gdt with 32-bit segments */
- movl 64(%rsp), %eax
- lgdt (%rax)
-
- leaq efi_enter32(%rip), %rax
- pushq $__KERNEL_CS
- pushq %rax
- lretq
-
-1: addq $32, %rsp
-
- lgdt save_gdt(%rip)
-
- pop %rbx
- movl %ebx, %ss
- pop %rbx
- movl %ebx, %es
- pop %rbx
- movl %ebx, %ds
-
- /*
- * Convert 32-bit status code into 64-bit.
- */
- test %rax, %rax
- jz 1f
- movl %eax, %ecx
- andl $0x0fffffff, %ecx
- andl $0xf0000000, %eax
- shl $32, %rax
- or %rcx, %rax
-1:
- ret
-ENDPROC(__efi64_thunk)
-
-ENTRY(efi_exit32)
- movq func_rt_ptr(%rip), %rax
- push %rax
- mov %rdi, %rax
- ret
-ENDPROC(efi_exit32)
-
- .code32
-/*
- * EFI service pointer must be in %edi.
- *
- * The stack should represent the 32-bit calling convention.
- */
-ENTRY(efi_enter32)
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %ss
-
- /* Reload pgtables */
- movl %cr3, %eax
- movl %eax, %cr3
-
- /* Disable paging */
- movl %cr0, %eax
- btrl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
-
- /* Disable long mode via EFER */
- movl $MSR_EFER, %ecx
- rdmsr
- btrl $_EFER_LME, %eax
- wrmsr
-
- call *%edi
-
- /* We must preserve return value */
- movl %eax, %edi
-
- /*
- * Some firmware will return with interrupts enabled. Be sure to
- * disable them before we switch GDTs.
- */
- cli
-
- movl 68(%esp), %eax
- movl %eax, 2(%eax)
- lgdtl (%eax)
-
- movl %cr4, %eax
- btsl $(X86_CR4_PAE_BIT), %eax
- movl %eax, %cr4
-
- movl %cr3, %eax
- movl %eax, %cr3
-
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_LME, %eax
- wrmsr
-
- xorl %eax, %eax
- lldt %ax
-
- movl 72(%esp), %eax
- pushl $__KERNEL_CS
- pushl %eax
-
- /* Enable paging */
- movl %cr0, %eax
- btsl $X86_CR0_PG_BIT, %eax
- movl %eax, %cr0
- lret
-ENDPROC(efi_enter32)
-
- .data
- .balign 8
- .global efi32_boot_gdt
-efi32_boot_gdt: .word 0
- .quad 0
-
-save_gdt: .word 0
- .quad 0
-func_rt_ptr: .quad 0
-
- .global efi_gdt64
-efi_gdt64:
- .word efi_gdt64_end - efi_gdt64
- .long 0 /* Filled out by user */
- .word 0
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x00af9a000000ffff /* __KERNEL_CS */
- .quad 0x00cf92000000ffff /* __KERNEL_DS */
- .quad 0x0080890000000000 /* TS descriptor */
- .quad 0x0000000000000000 /* TS continued */
-efi_gdt64_end:
-#endif /* CONFIG_EFI_MIXED */
-
.data
ENTRY(efi_scratch)
.fill 3,8,0
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 8806fa73e6e6..ff85d28c50f2 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -1,9 +1,26 @@
/*
* Copyright (C) 2014 Intel Corporation; author Matt Fleming
+ *
+ * Support for invoking 32-bit EFI runtime services from a 64-bit
+ * kernel.
+ *
+ * The below thunking functions are only used after ExitBootServices()
+ * has been called. This simplifies things considerably as compared with
+ * the early EFI thunking because we can leave all the kernel state
+ * intact (GDT, IDT, etc) and simply invoke the the 32-bit EFI runtime
+ * services from __KERNEL32_CS. This means we can continue to service
+ * interrupts across an EFI mixed mode call.
+ *
+ * We do however, need to handle the fact that we're running in a full
+ * 64-bit virtual address space. Things like the stack and instruction
+ * addresses need to be accessible by the 32-bit firmware, so we rely on
+ * using the identity mappings in the EFI page table to access the stack
+ * and kernel text (see efi_setup_page_tables()).
*/
#include <linux/linkage.h>
#include <asm/page_types.h>
+#include <asm/segment.h>
.text
.code64
@@ -33,14 +50,6 @@ ENTRY(efi64_thunk)
leaq efi_exit32(%rip), %rbx
subq %rax, %rbx
movl %ebx, 8(%rsp)
- leaq efi_gdt64(%rip), %rbx
- subq %rax, %rbx
- movl %ebx, 2(%ebx)
- movl %ebx, 4(%rsp)
- leaq efi_gdt32(%rip), %rbx
- subq %rax, %rbx
- movl %ebx, 2(%ebx)
- movl %ebx, (%rsp)
leaq __efi64_thunk(%rip), %rbx
subq %rax, %rbx
@@ -52,14 +61,92 @@ ENTRY(efi64_thunk)
retq
ENDPROC(efi64_thunk)
- .data
-efi_gdt32:
- .word efi_gdt32_end - efi_gdt32
- .long 0 /* Filled out above */
- .word 0
- .quad 0x0000000000000000 /* NULL descriptor */
- .quad 0x00cf9a000000ffff /* __KERNEL_CS */
- .quad 0x00cf93000000ffff /* __KERNEL_DS */
-efi_gdt32_end:
+/*
+ * We run this function from the 1:1 mapping.
+ *
+ * This function must be invoked with a 1:1 mapped stack.
+ */
+ENTRY(__efi64_thunk)
+ movl %ds, %eax
+ push %rax
+ movl %es, %eax
+ push %rax
+ movl %ss, %eax
+ push %rax
+
+ subq $32, %rsp
+ movl %esi, 0x0(%rsp)
+ movl %edx, 0x4(%rsp)
+ movl %ecx, 0x8(%rsp)
+ movq %r8, %rsi
+ movl %esi, 0xc(%rsp)
+ movq %r9, %rsi
+ movl %esi, 0x10(%rsp)
+
+ leaq 1f(%rip), %rbx
+ movq %rbx, func_rt_ptr(%rip)
+
+ /* Switch to 32-bit descriptor */
+ pushq $__KERNEL32_CS
+ leaq efi_enter32(%rip), %rax
+ pushq %rax
+ lretq
+
+1: addq $32, %rsp
+
+ pop %rbx
+ movl %ebx, %ss
+ pop %rbx
+ movl %ebx, %es
+ pop %rbx
+ movl %ebx, %ds
+ /*
+ * Convert 32-bit status code into 64-bit.
+ */
+ test %rax, %rax
+ jz 1f
+ movl %eax, %ecx
+ andl $0x0fffffff, %ecx
+ andl $0xf0000000, %eax
+ shl $32, %rax
+ or %rcx, %rax
+1:
+ ret
+ENDPROC(__efi64_thunk)
+
+ENTRY(efi_exit32)
+ movq func_rt_ptr(%rip), %rax
+ push %rax
+ mov %rdi, %rax
+ ret
+ENDPROC(efi_exit32)
+
+ .code32
+/*
+ * EFI service pointer must be in %edi.
+ *
+ * The stack should represent the 32-bit calling convention.
+ */
+ENTRY(efi_enter32)
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %ss
+
+ call *%edi
+
+ /* We must preserve return value */
+ movl %eax, %edi
+
+ movl 72(%esp), %eax
+ pushl $__KERNEL_CS
+ pushl %eax
+
+ lret
+ENDPROC(efi_enter32)
+
+ .data
+ .balign 8
+func_rt_ptr: .quad 0
efi_saved_sp: .quad 0
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index af9307f2cc28..91ec9f8704bf 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -16,8 +16,6 @@ obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o
obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
obj-$(subst m,y,$(CONFIG_GPIO_PCA953X)) += platform_tca6416.o
obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
-# SPI Devices
-obj-$(subst m,y,$(CONFIG_SERIAL_MRST_MAX3110)) += platform_max3111.o
# MISC Devices
obj-$(subst m,y,$(CONFIG_KEYBOARD_GPIO)) += platform_gpio_keys.o
obj-$(subst m,y,$(CONFIG_INTEL_MID_WATCHDOG)) += platform_wdt.o
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c b/arch/x86/platform/intel-mid/device_libs/platform_max3111.c
deleted file mode 100644
index afd1df94e0e5..000000000000
--- a/arch/x86/platform/intel-mid/device_libs/platform_max3111.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * platform_max3111.c: max3111 platform data initilization file
- *
- * (C) Copyright 2013 Intel Corporation
- * Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-#include <linux/gpio.h>
-#include <linux/spi/spi.h>
-#include <asm/intel-mid.h>
-
-static void __init *max3111_platform_data(void *info)
-{
- struct spi_board_info *spi_info = info;
- int intr = get_gpio_by_name("max3111_int");
-
- spi_info->mode = SPI_MODE_0;
- if (intr == -1)
- return NULL;
- spi_info->irq = intr + INTEL_MID_IRQ_OFFSET;
- return NULL;
-}
-
-static const struct devs_id max3111_dev_id __initconst = {
- .name = "spi_max3111",
- .type = SFI_DEV_TYPE_SPI,
- .get_platform_data = &max3111_platform_data,
-};
-
-sfi_device(max3111_dev_id);
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
index e0bd082a80e0..4e720829ab90 100644
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
@@ -10,15 +10,13 @@
*/
/*
- * This file implements two early consoles named mrst and hsu.
- * mrst is based on Maxim3110 spi-uart device, it exists in both
- * Moorestown and Medfield platforms, while hsu is based on a High
- * Speed UART device which only exists in the Medfield platform
+ * This file implements early console named hsu.
+ * hsu is based on a High Speed UART device which only exists in the Medfield
+ * platform
*/
#include <linux/serial_reg.h>
#include <linux/serial_mfd.h>
-#include <linux/kmsg_dump.h>
#include <linux/console.h>
#include <linux/kernel.h>
#include <linux/delay.h>
@@ -28,216 +26,6 @@
#include <asm/pgtable.h>
#include <asm/intel-mid.h>
-#define MRST_SPI_TIMEOUT 0x200000
-#define MRST_REGBASE_SPI0 0xff128000
-#define MRST_REGBASE_SPI1 0xff128400
-#define MRST_CLK_SPI0_REG 0xff11d86c
-
-/* Bit fields in CTRLR0 */
-#define SPI_DFS_OFFSET 0
-
-#define SPI_FRF_OFFSET 4
-#define SPI_FRF_SPI 0x0
-#define SPI_FRF_SSP 0x1
-#define SPI_FRF_MICROWIRE 0x2
-#define SPI_FRF_RESV 0x3
-
-#define SPI_MODE_OFFSET 6
-#define SPI_SCPH_OFFSET 6
-#define SPI_SCOL_OFFSET 7
-#define SPI_TMOD_OFFSET 8
-#define SPI_TMOD_TR 0x0 /* xmit & recv */
-#define SPI_TMOD_TO 0x1 /* xmit only */
-#define SPI_TMOD_RO 0x2 /* recv only */
-#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
-
-#define SPI_SLVOE_OFFSET 10
-#define SPI_SRL_OFFSET 11
-#define SPI_CFS_OFFSET 12
-
-/* Bit fields in SR, 7 bits */
-#define SR_MASK 0x7f /* cover 7 bits */
-#define SR_BUSY (1 << 0)
-#define SR_TF_NOT_FULL (1 << 1)
-#define SR_TF_EMPT (1 << 2)
-#define SR_RF_NOT_EMPT (1 << 3)
-#define SR_RF_FULL (1 << 4)
-#define SR_TX_ERR (1 << 5)
-#define SR_DCOL (1 << 6)
-
-struct dw_spi_reg {
- u32 ctrl0;
- u32 ctrl1;
- u32 ssienr;
- u32 mwcr;
- u32 ser;
- u32 baudr;
- u32 txfltr;
- u32 rxfltr;
- u32 txflr;
- u32 rxflr;
- u32 sr;
- u32 imr;
- u32 isr;
- u32 risr;
- u32 txoicr;
- u32 rxoicr;
- u32 rxuicr;
- u32 msticr;
- u32 icr;
- u32 dmacr;
- u32 dmatdlr;
- u32 dmardlr;
- u32 idr;
- u32 version;
-
- /* Currently operates as 32 bits, though only the low 16 bits matter */
- u32 dr;
-} __packed;
-
-#define dw_readl(dw, name) __raw_readl(&(dw)->name)
-#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
-
-/* Default use SPI0 register for mrst, we will detect Penwell and use SPI1 */
-static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
-
-static u32 *pclk_spi0;
-/* Always contains an accessible address, start with 0 */
-static struct dw_spi_reg *pspi;
-
-static struct kmsg_dumper dw_dumper;
-static int dumper_registered;
-
-static void dw_kmsg_dump(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
-{
- static char line[1024];
- size_t len;
-
- /* When run to this, we'd better re-init the HW */
- mrst_early_console_init();
-
- while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
- early_mrst_console.write(&early_mrst_console, line, len);
-}
-
-/* Set the ratio rate to 115200, 8n1, IRQ disabled */
-static void max3110_write_config(void)
-{
- u16 config;
-
- config = 0xc001;
- dw_writel(pspi, dr, config);
-}
-
-/* Translate char to a eligible word and send to max3110 */
-static void max3110_write_data(char c)
-{
- u16 data;
-
- data = 0x8000 | c;
- dw_writel(pspi, dr, data);
-}
-
-void mrst_early_console_init(void)
-{
- u32 ctrlr0 = 0;
- u32 spi0_cdiv;
- u32 freq; /* Freqency info only need be searched once */
-
- /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
- pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- MRST_CLK_SPI0_REG);
- spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
- freq = 100000000 / (spi0_cdiv + 1);
-
- if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL)
- mrst_spi_paddr = MRST_REGBASE_SPI1;
-
- pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
- mrst_spi_paddr);
-
- /* Disable SPI controller */
- dw_writel(pspi, ssienr, 0);
-
- /* Set control param, 8 bits, transmit only mode */
- ctrlr0 = dw_readl(pspi, ctrl0);
-
- ctrlr0 &= 0xfcc0;
- ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
- | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
- dw_writel(pspi, ctrl0, ctrlr0);
-
- /*
- * Change the spi0 clk to comply with 115200 bps, use 100000 to
- * calculate the clk dividor to make the clock a little slower
- * than real baud rate.
- */
- dw_writel(pspi, baudr, freq/100000);
-
- /* Disable all INT for early phase */
- dw_writel(pspi, imr, 0x0);
-
- /* Set the cs to spi-uart */
- dw_writel(pspi, ser, 0x2);
-
- /* Enable the HW, the last step for HW init */
- dw_writel(pspi, ssienr, 0x1);
-
- /* Set the default configuration */
- max3110_write_config();
-
- /* Register the kmsg dumper */
- if (!dumper_registered) {
- dw_dumper.dump = dw_kmsg_dump;
- kmsg_dump_register(&dw_dumper);
- dumper_registered = 1;
- }
-}
-
-/* Slave select should be called in the read/write function */
-static void early_mrst_spi_putc(char c)
-{
- unsigned int timeout;
- u32 sr;
-
- timeout = MRST_SPI_TIMEOUT;
- /* Early putc needs to make sure the TX FIFO is not full */
- while (--timeout) {
- sr = dw_readl(pspi, sr);
- if (!(sr & SR_TF_NOT_FULL))
- cpu_relax();
- else
- break;
- }
-
- if (!timeout)
- pr_warn("MRST earlycon: timed out\n");
- else
- max3110_write_data(c);
-}
-
-/* Early SPI only uses polling mode */
-static void early_mrst_spi_write(struct console *con, const char *str,
- unsigned n)
-{
- int i;
-
- for (i = 0; i < n && *str; i++) {
- if (*str == '\n')
- early_mrst_spi_putc('\r');
- early_mrst_spi_putc(*str);
- str++;
- }
-}
-
-struct console early_mrst_console = {
- .name = "earlymrst",
- .write = early_mrst_spi_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
-
/*
* Following is the early console based on Medfield HSU (High
* Speed UART) device.
@@ -259,7 +47,7 @@ void hsu_early_console_init(const char *s)
port = clamp_val(port, 0, 2);
paddr = HSU_PORT_BASE + port * 0x80;
- phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
+ phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
/* Disable FIFO */
writeb(0x0, phsu + UART_FCR);
diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
index 4762cff7facd..32947ba0f62d 100644
--- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c
+++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c
@@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now)
spin_unlock_irqrestore(&rtc_lock, flags);
} else {
pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n",
- __FUNCTION__, now->tv_sec);
+ __func__, now->tv_sec);
retval = -EINVAL;
}
return retval;
diff --git a/arch/x86/platform/intel-quark/Makefile b/arch/x86/platform/intel-quark/Makefile
new file mode 100644
index 000000000000..9cc57ed36022
--- /dev/null
+++ b/arch/x86/platform/intel-quark/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INTEL_IMR) += imr.o
+obj-$(CONFIG_DEBUG_IMR_SELFTEST) += imr_selftest.o
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
new file mode 100644
index 000000000000..0ee619f9fcb7
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -0,0 +1,661 @@
+/**
+ * imr.c
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR registers define an isolated region of memory that can
+ * be masked to prohibit certain system agents from accessing memory.
+ * When a device behind a masked port performs an access - snooped or
+ * not, an IMR may optionally prevent that transaction from changing
+ * the state of memory or from getting correct data in response to the
+ * operation.
+ *
+ * Write data will be dropped and reads will return 0xFFFFFFFF, the
+ * system will reset and system BIOS will print out an error message to
+ * inform the user that an IMR has been violated.
+ *
+ * This code is based on the Linux MTRR code and reference code from
+ * Intel's Quark BSP EFI, Linux and grub code.
+ *
+ * See quark-x1000-datasheet.pdf for register definitions.
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/quark-x1000-datasheet.pdf
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
+#include <asm/imr.h>
+#include <asm/iosf_mbi.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+struct imr_device {
+ struct dentry *file;
+ bool init;
+ struct mutex lock;
+ int max_imr;
+ int reg_base;
+};
+
+static struct imr_device imr_dev;
+
+/*
+ * IMR read/write mask control registers.
+ * See quark-x1000-datasheet.pdf sections 12.7.4.5 and 12.7.4.6 for
+ * bit definitions.
+ *
+ * addr_hi
+ * 31 Lock bit
+ * 30:24 Reserved
+ * 23:2 1 KiB aligned lo address
+ * 1:0 Reserved
+ *
+ * addr_hi
+ * 31:24 Reserved
+ * 23:2 1 KiB aligned hi address
+ * 1:0 Reserved
+ */
+#define IMR_LOCK BIT(31)
+
+struct imr_regs {
+ u32 addr_lo;
+ u32 addr_hi;
+ u32 rmask;
+ u32 wmask;
+};
+
+#define IMR_NUM_REGS (sizeof(struct imr_regs)/sizeof(u32))
+#define IMR_SHIFT 8
+#define imr_to_phys(x) ((x) << IMR_SHIFT)
+#define phys_to_imr(x) ((x) >> IMR_SHIFT)
+
+/**
+ * imr_is_enabled - true if an IMR is enabled false otherwise.
+ *
+ * Determines if an IMR is enabled based on address range and read/write
+ * mask. An IMR set with an address range set to zero and a read/write
+ * access mask set to all is considered to be disabled. An IMR in any
+ * other state - for example set to zero but without read/write access
+ * all is considered to be enabled. This definition of disabled is how
+ * firmware switches off an IMR and is maintained in kernel for
+ * consistency.
+ *
+ * @imr: pointer to IMR descriptor.
+ * @return: true if IMR enabled false if disabled.
+ */
+static inline int imr_is_enabled(struct imr_regs *imr)
+{
+ return !(imr->rmask == IMR_READ_ACCESS_ALL &&
+ imr->wmask == IMR_WRITE_ACCESS_ALL &&
+ imr_to_phys(imr->addr_lo) == 0 &&
+ imr_to_phys(imr->addr_hi) == 0);
+}
+
+/**
+ * imr_read - read an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ *
+ * @idev: pointer to imr_device structure.
+ * @imr_id: IMR entry to read.
+ * @imr: IMR structure representing address and access masks.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
+{
+ u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+ int ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+ reg++, &imr->addr_lo);
+ if (ret)
+ return ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+ reg++, &imr->addr_hi);
+ if (ret)
+ return ret;
+
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+ reg++, &imr->rmask);
+ if (ret)
+ return ret;
+
+ return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
+ reg++, &imr->wmask);
+}
+
+/**
+ * imr_write - write an IMR at a given index.
+ *
+ * Requires caller to hold imr mutex.
+ * Note lock bits need to be written independently of address bits.
+ *
+ * @idev: pointer to imr_device structure.
+ * @imr_id: IMR entry to write.
+ * @imr: IMR structure representing address and access masks.
+ * @lock: indicates if the IMR lock bit should be applied.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_write(struct imr_device *idev, u32 imr_id,
+ struct imr_regs *imr, bool lock)
+{
+ unsigned long flags;
+ u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
+ int ret;
+
+ local_irq_save(flags);
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++,
+ imr->addr_lo);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+ reg++, imr->addr_hi);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+ reg++, imr->rmask);
+ if (ret)
+ goto failed;
+
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+ reg++, imr->wmask);
+ if (ret)
+ goto failed;
+
+ /* Lock bit must be set separately to addr_lo address bits. */
+ if (lock) {
+ imr->addr_lo |= IMR_LOCK;
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
+ reg - IMR_NUM_REGS, imr->addr_lo);
+ if (ret)
+ goto failed;
+ }
+
+ local_irq_restore(flags);
+ return 0;
+failed:
+ /*
+ * If writing to the IOSF failed then we're in an unknown state,
+ * likely a very bad state. An IMR in an invalid state will almost
+ * certainly lead to a memory access violation.
+ */
+ local_irq_restore(flags);
+ WARN(ret, "IOSF-MBI write fail range 0x%08x-0x%08x unreliable\n",
+ imr_to_phys(imr->addr_lo), imr_to_phys(imr->addr_hi) + IMR_MASK);
+
+ return ret;
+}
+
+/**
+ * imr_dbgfs_state_show - print state of IMR registers.
+ *
+ * @s: pointer to seq_file for output.
+ * @unused: unused parameter.
+ * @return: 0 on success or error code passed from mbi_iosf on failure.
+ */
+static int imr_dbgfs_state_show(struct seq_file *s, void *unused)
+{
+ phys_addr_t base;
+ phys_addr_t end;
+ int i;
+ struct imr_device *idev = s->private;
+ struct imr_regs imr;
+ size_t size;
+ int ret = -ENODEV;
+
+ mutex_lock(&idev->lock);
+
+ for (i = 0; i < idev->max_imr; i++) {
+
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ break;
+
+ /*
+ * Remember to add IMR_ALIGN bytes to size to indicate the
+ * inherent IMR_ALIGN size bytes contained in the masked away
+ * lower ten bits.
+ */
+ if (imr_is_enabled(&imr)) {
+ base = imr_to_phys(imr.addr_lo);
+ end = imr_to_phys(imr.addr_hi) + IMR_MASK;
+ } else {
+ base = 0;
+ end = 0;
+ }
+ size = end - base;
+ seq_printf(s, "imr%02i: base=%pa, end=%pa, size=0x%08zx "
+ "rmask=0x%08x, wmask=0x%08x, %s, %s\n", i,
+ &base, &end, size, imr.rmask, imr.wmask,
+ imr_is_enabled(&imr) ? "enabled " : "disabled",
+ imr.addr_lo & IMR_LOCK ? "locked" : "unlocked");
+ }
+
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+
+/**
+ * imr_state_open - debugfs open callback.
+ *
+ * @inode: pointer to struct inode.
+ * @file: pointer to struct file.
+ * @return: result of single open.
+ */
+static int imr_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, imr_dbgfs_state_show, inode->i_private);
+}
+
+static const struct file_operations imr_state_ops = {
+ .open = imr_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/**
+ * imr_debugfs_register - register debugfs hooks.
+ *
+ * @idev: pointer to imr_device structure.
+ * @return: 0 on success - errno on failure.
+ */
+static int imr_debugfs_register(struct imr_device *idev)
+{
+ idev->file = debugfs_create_file("imr_state", S_IFREG | S_IRUGO, NULL,
+ idev, &imr_state_ops);
+ return PTR_ERR_OR_ZERO(idev->file);
+}
+
+/**
+ * imr_debugfs_unregister - unregister debugfs hooks.
+ *
+ * @idev: pointer to imr_device structure.
+ * @return:
+ */
+static void imr_debugfs_unregister(struct imr_device *idev)
+{
+ debugfs_remove(idev->file);
+}
+
+/**
+ * imr_check_params - check passed address range IMR alignment and non-zero size
+ *
+ * @base: base address of intended IMR.
+ * @size: size of intended IMR.
+ * @return: zero on valid range -EINVAL on unaligned base/size.
+ */
+static int imr_check_params(phys_addr_t base, size_t size)
+{
+ if ((base & IMR_MASK) || (size & IMR_MASK)) {
+ pr_err("base %pa size 0x%08zx must align to 1KiB\n",
+ &base, size);
+ return -EINVAL;
+ }
+ if (size == 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * imr_raw_size - account for the IMR_ALIGN bytes that addr_hi appends.
+ *
+ * IMR addr_hi has a built in offset of plus IMR_ALIGN (0x400) bytes from the
+ * value in the register. We need to subtract IMR_ALIGN bytes from input sizes
+ * as a result.
+ *
+ * @size: input size bytes.
+ * @return: reduced size.
+ */
+static inline size_t imr_raw_size(size_t size)
+{
+ return size - IMR_ALIGN;
+}
+
+/**
+ * imr_address_overlap - detects an address overlap.
+ *
+ * @addr: address to check against an existing IMR.
+ * @imr: imr being checked.
+ * @return: true for overlap false for no overlap.
+ */
+static inline int imr_address_overlap(phys_addr_t addr, struct imr_regs *imr)
+{
+ return addr >= imr_to_phys(imr->addr_lo) && addr <= imr_to_phys(imr->addr_hi);
+}
+
+/**
+ * imr_add_range - add an Isolated Memory Region.
+ *
+ * @base: physical base address of region aligned to 1KiB.
+ * @size: physical size of region in bytes must be aligned to 1KiB.
+ * @read_mask: read access mask.
+ * @write_mask: write access mask.
+ * @lock: indicates whether or not to permanently lock this region.
+ * @return: zero on success or negative value indicating error.
+ */
+int imr_add_range(phys_addr_t base, size_t size,
+ unsigned int rmask, unsigned int wmask, bool lock)
+{
+ phys_addr_t end;
+ unsigned int i;
+ struct imr_device *idev = &imr_dev;
+ struct imr_regs imr;
+ size_t raw_size;
+ int reg;
+ int ret;
+
+ if (WARN_ONCE(idev->init == false, "driver not initialized"))
+ return -ENODEV;
+
+ ret = imr_check_params(base, size);
+ if (ret)
+ return ret;
+
+ /* Tweak the size value. */
+ raw_size = imr_raw_size(size);
+ end = base + raw_size;
+
+ /*
+ * Check for reserved IMR value common to firmware, kernel and grub
+ * indicating a disabled IMR.
+ */
+ imr.addr_lo = phys_to_imr(base);
+ imr.addr_hi = phys_to_imr(end);
+ imr.rmask = rmask;
+ imr.wmask = wmask;
+ if (!imr_is_enabled(&imr))
+ return -ENOTSUPP;
+
+ mutex_lock(&idev->lock);
+
+ /*
+ * Find a free IMR while checking for an existing overlapping range.
+ * Note there's no restriction in silicon to prevent IMR overlaps.
+ * For the sake of simplicity and ease in defining/debugging an IMR
+ * memory map we exclude IMR overlaps.
+ */
+ reg = -1;
+ for (i = 0; i < idev->max_imr; i++) {
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ goto failed;
+
+ /* Find overlap @ base or end of requested range. */
+ ret = -EINVAL;
+ if (imr_is_enabled(&imr)) {
+ if (imr_address_overlap(base, &imr))
+ goto failed;
+ if (imr_address_overlap(end, &imr))
+ goto failed;
+ } else {
+ reg = i;
+ }
+ }
+
+ /* Error out if we have no free IMR entries. */
+ if (reg == -1) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ pr_debug("add %d phys %pa-%pa size %zx mask 0x%08x wmask 0x%08x\n",
+ reg, &base, &end, raw_size, rmask, wmask);
+
+ /* Enable IMR at specified range and access mask. */
+ imr.addr_lo = phys_to_imr(base);
+ imr.addr_hi = phys_to_imr(end);
+ imr.rmask = rmask;
+ imr.wmask = wmask;
+
+ ret = imr_write(idev, reg, &imr, lock);
+ if (ret < 0) {
+ /*
+ * In the highly unlikely event iosf_mbi_write failed
+ * attempt to rollback the IMR setup skipping the trapping
+ * of further IOSF write failures.
+ */
+ imr.addr_lo = 0;
+ imr.addr_hi = 0;
+ imr.rmask = IMR_READ_ACCESS_ALL;
+ imr.wmask = IMR_WRITE_ACCESS_ALL;
+ imr_write(idev, reg, &imr, false);
+ }
+failed:
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(imr_add_range);
+
+/**
+ * __imr_remove_range - delete an Isolated Memory Region.
+ *
+ * This function allows you to delete an IMR by its index specified by reg or
+ * by address range specified by base and size respectively. If you specify an
+ * index on its own the base and size parameters are ignored.
+ * imr_remove_range(0, base, size); delete IMR at index 0 base/size ignored.
+ * imr_remove_range(-1, base, size); delete IMR from base to base+size.
+ *
+ * @reg: imr index to remove.
+ * @base: physical base address of region aligned to 1 KiB.
+ * @size: physical size of region in bytes aligned to 1 KiB.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+static int __imr_remove_range(int reg, phys_addr_t base, size_t size)
+{
+ phys_addr_t end;
+ bool found = false;
+ unsigned int i;
+ struct imr_device *idev = &imr_dev;
+ struct imr_regs imr;
+ size_t raw_size;
+ int ret = 0;
+
+ if (WARN_ONCE(idev->init == false, "driver not initialized"))
+ return -ENODEV;
+
+ /*
+ * Validate address range if deleting by address, else we are
+ * deleting by index where base and size will be ignored.
+ */
+ if (reg == -1) {
+ ret = imr_check_params(base, size);
+ if (ret)
+ return ret;
+ }
+
+ /* Tweak the size value. */
+ raw_size = imr_raw_size(size);
+ end = base + raw_size;
+
+ mutex_lock(&idev->lock);
+
+ if (reg >= 0) {
+ /* If a specific IMR is given try to use it. */
+ ret = imr_read(idev, reg, &imr);
+ if (ret)
+ goto failed;
+
+ if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK) {
+ ret = -ENODEV;
+ goto failed;
+ }
+ found = true;
+ } else {
+ /* Search for match based on address range. */
+ for (i = 0; i < idev->max_imr; i++) {
+ ret = imr_read(idev, i, &imr);
+ if (ret)
+ goto failed;
+
+ if (!imr_is_enabled(&imr) || imr.addr_lo & IMR_LOCK)
+ continue;
+
+ if ((imr_to_phys(imr.addr_lo) == base) &&
+ (imr_to_phys(imr.addr_hi) == end)) {
+ found = true;
+ reg = i;
+ break;
+ }
+ }
+ }
+
+ if (!found) {
+ ret = -ENODEV;
+ goto failed;
+ }
+
+ pr_debug("remove %d phys %pa-%pa size %zx\n", reg, &base, &end, raw_size);
+
+ /* Tear down the IMR. */
+ imr.addr_lo = 0;
+ imr.addr_hi = 0;
+ imr.rmask = IMR_READ_ACCESS_ALL;
+ imr.wmask = IMR_WRITE_ACCESS_ALL;
+
+ ret = imr_write(idev, reg, &imr, false);
+
+failed:
+ mutex_unlock(&idev->lock);
+ return ret;
+}
+
+/**
+ * imr_remove_range - delete an Isolated Memory Region by address
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by base and size respectively.
+ * imr_remove_range(base, size); delete IMR from base to base+size.
+ *
+ * @base: physical base address of region aligned to 1 KiB.
+ * @size: physical size of region in bytes aligned to 1 KiB.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+int imr_remove_range(phys_addr_t base, size_t size)
+{
+ return __imr_remove_range(-1, base, size);
+}
+EXPORT_SYMBOL_GPL(imr_remove_range);
+
+/**
+ * imr_clear - delete an Isolated Memory Region by index
+ *
+ * This function allows you to delete an IMR by an address range specified
+ * by the index of the IMR. Useful for initial sanitization of the IMR
+ * address map.
+ * imr_ge(base, size); delete IMR from base to base+size.
+ *
+ * @reg: imr index to remove.
+ * @return: -EINVAL on invalid range or out or range id
+ * -ENODEV if reg is valid but no IMR exists or is locked
+ * 0 on success.
+ */
+static inline int imr_clear(int reg)
+{
+ return __imr_remove_range(reg, 0, 0);
+}
+
+/**
+ * imr_fixup_memmap - Tear down IMRs used during bootup.
+ *
+ * BIOS and Grub both setup IMRs around compressed kernel, initrd memory
+ * that need to be removed before the kernel hands out one of the IMR
+ * encased addresses to a downstream DMA agent such as the SD or Ethernet.
+ * IMRs on Galileo are setup to immediately reset the system on violation.
+ * As a result if you're running a root filesystem from SD - you'll need
+ * the boot-time IMRs torn down or you'll find seemingly random resets when
+ * using your filesystem.
+ *
+ * @idev: pointer to imr_device structure.
+ * @return:
+ */
+static void __init imr_fixup_memmap(struct imr_device *idev)
+{
+ phys_addr_t base = virt_to_phys(&_text);
+ size_t size = virt_to_phys(&__end_rodata) - base;
+ int i;
+ int ret;
+
+ /* Tear down all existing unlocked IMRs. */
+ for (i = 0; i < idev->max_imr; i++)
+ imr_clear(i);
+
+ /*
+ * Setup a locked IMR around the physical extent of the kernel
+ * from the beginning of the .text secton to the end of the
+ * .rodata section as one physically contiguous block.
+ */
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, true);
+ if (ret < 0) {
+ pr_err("unable to setup IMR for kernel: (%p - %p)\n",
+ &_text, &__end_rodata);
+ } else {
+ pr_info("protecting kernel .text - .rodata: %zu KiB (%p - %p)\n",
+ size / 1024, &_text, &__end_rodata);
+ }
+
+}
+
+static const struct x86_cpu_id imr_ids[] __initconst = {
+ { X86_VENDOR_INTEL, 5, 9 }, /* Intel Quark SoC X1000. */
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, imr_ids);
+
+/**
+ * imr_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_init(void)
+{
+ struct imr_device *idev = &imr_dev;
+ int ret;
+
+ if (!x86_match_cpu(imr_ids) || !iosf_mbi_available())
+ return -ENODEV;
+
+ idev->max_imr = QUARK_X1000_IMR_MAX;
+ idev->reg_base = QUARK_X1000_IMR_REGBASE;
+ idev->init = true;
+
+ mutex_init(&idev->lock);
+ ret = imr_debugfs_register(idev);
+ if (ret != 0)
+ pr_warn("debugfs register failed!\n");
+ imr_fixup_memmap(idev);
+ return 0;
+}
+
+/**
+ * imr_exit - exit point for IMR code.
+ *
+ * Deregisters debugfs, leave IMR state as-is.
+ *
+ * return:
+ */
+static void __exit imr_exit(void)
+{
+ imr_debugfs_unregister(&imr_dev);
+}
+
+module_init(imr_init);
+module_exit(imr_exit);
+
+MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
+MODULE_DESCRIPTION("Intel Isolated Memory Region driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
new file mode 100644
index 000000000000..c9a0838890e2
--- /dev/null
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -0,0 +1,129 @@
+/**
+ * imr_selftest.c
+ *
+ * Copyright(c) 2013 Intel Corporation.
+ * Copyright(c) 2015 Bryan O'Donoghue <pure.logic@nexus-software.ie>
+ *
+ * IMR self test. The purpose of this module is to run a set of tests on the
+ * IMR API to validate it's sanity. We check for overlapping, reserved
+ * addresses and setup/teardown sanity.
+ *
+ */
+
+#include <asm-generic/sections.h>
+#include <asm/imr.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define SELFTEST KBUILD_MODNAME ": "
+/**
+ * imr_self_test_result - Print result string for self test.
+ *
+ * @res: result code - true if test passed false otherwise.
+ * @fmt: format string.
+ * ... variadic argument list.
+ */
+static void __init imr_self_test_result(int res, const char *fmt, ...)
+{
+ va_list vlist;
+
+ /* Print pass/fail. */
+ if (res)
+ pr_info(SELFTEST "pass ");
+ else
+ pr_info(SELFTEST "fail ");
+
+ /* Print variable string. */
+ va_start(vlist, fmt);
+ vprintk(fmt, vlist);
+ va_end(vlist);
+
+ /* Optional warning. */
+ WARN(res == 0, "test failed");
+}
+#undef SELFTEST
+
+/**
+ * imr_self_test
+ *
+ * Verify IMR self_test with some simple tests to verify overlap,
+ * zero sized allocations and 1 KiB sized areas.
+ *
+ */
+static void __init imr_self_test(void)
+{
+ phys_addr_t base = virt_to_phys(&_text);
+ size_t size = virt_to_phys(&__end_rodata) - base;
+ const char *fmt_over = "overlapped IMR @ (0x%08lx - 0x%08lx)\n";
+ int ret;
+
+ /* Test zero zero. */
+ ret = imr_add_range(0, 0, 0, 0, false);
+ imr_self_test_result(ret < 0, "zero sized IMR\n");
+
+ /* Test exact overlap. */
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test overlap with base inside of existing. */
+ base += size - IMR_ALIGN;
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test overlap with end inside of existing. */
+ base -= size + IMR_ALIGN * 2;
+ ret = imr_add_range(base, size, IMR_CPU, IMR_CPU, false);
+ imr_self_test_result(ret < 0, fmt_over, __va(base), __va(base + size));
+
+ /* Test that a 1 KiB IMR @ zero with read/write all will bomb out. */
+ ret = imr_add_range(0, IMR_ALIGN, IMR_READ_ACCESS_ALL,
+ IMR_WRITE_ACCESS_ALL, false);
+ imr_self_test_result(ret < 0, "1KiB IMR @ 0x00000000 - access-all\n");
+
+ /* Test that a 1 KiB IMR @ zero with CPU only will work. */
+ ret = imr_add_range(0, IMR_ALIGN, IMR_CPU, IMR_CPU, false);
+ imr_self_test_result(ret >= 0, "1KiB IMR @ 0x00000000 - cpu-access\n");
+ if (ret >= 0) {
+ ret = imr_remove_range(0, IMR_ALIGN);
+ imr_self_test_result(ret == 0, "teardown - cpu-access\n");
+ }
+
+ /* Test 2 KiB works. */
+ size = IMR_ALIGN * 2;
+ ret = imr_add_range(0, size, IMR_READ_ACCESS_ALL,
+ IMR_WRITE_ACCESS_ALL, false);
+ imr_self_test_result(ret >= 0, "2KiB IMR @ 0x00000000\n");
+ if (ret >= 0) {
+ ret = imr_remove_range(0, size);
+ imr_self_test_result(ret == 0, "teardown 2KiB\n");
+ }
+}
+
+/**
+ * imr_self_test_init - entry point for IMR driver.
+ *
+ * return: -ENODEV for no IMR support 0 if good to go.
+ */
+static int __init imr_self_test_init(void)
+{
+ imr_self_test();
+ return 0;
+}
+
+/**
+ * imr_self_test_exit - exit point for IMR code.
+ *
+ * return:
+ */
+static void __exit imr_self_test_exit(void)
+{
+}
+
+module_init(imr_self_test_init);
+module_exit(imr_self_test_exit);
+
+MODULE_AUTHOR("Bryan O'Donoghue <pure.logic@nexus-software.ie>");
+MODULE_DESCRIPTION("Intel Isolated Memory Region self-test driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index c6b146e67116..7488cafab955 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -273,20 +273,6 @@ static inline void uv_clear_nmi(int cpu)
}
}
-/* Print non-responding cpus */
-static void uv_nmi_nr_cpus_pr(char *fmt)
-{
- static char cpu_list[1024];
- int len = sizeof(cpu_list);
- int c = cpumask_weight(uv_nmi_cpu_mask);
- int n = cpulist_scnprintf(cpu_list, len, uv_nmi_cpu_mask);
-
- if (n >= len-1)
- strcpy(&cpu_list[len - 6], "...\n");
-
- printk(fmt, c, cpu_list);
-}
-
/* Ping non-responding cpus attemping to force them into the NMI handler */
static void uv_nmi_nr_cpus_ping(void)
{
@@ -371,16 +357,19 @@ static void uv_nmi_wait(int master)
break;
/* if not all made it in, send IPI NMI to them */
- uv_nmi_nr_cpus_pr(KERN_ALERT
- "UV: Sending NMI IPI to %d non-responding CPUs: %s\n");
+ pr_alert("UV: Sending NMI IPI to %d non-responding CPUs: %*pbl\n",
+ cpumask_weight(uv_nmi_cpu_mask),
+ cpumask_pr_args(uv_nmi_cpu_mask));
+
uv_nmi_nr_cpus_ping();
/* if all cpus are in, then done */
if (!uv_nmi_wait_cpus(0))
break;
- uv_nmi_nr_cpus_pr(KERN_ALERT
- "UV: %d CPUs not in NMI loop: %s\n");
+ pr_alert("UV: %d CPUs not in NMI loop: %*pbl\n",
+ cpumask_weight(uv_nmi_cpu_mask),
+ cpumask_pr_args(uv_nmi_cpu_mask));
} while (0);
pr_alert("UV: %d of %d CPUs in NMI\n",
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6ec7910f59bf..3e32ed5648a0 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,11 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr0 = read_cr0();
ctxt->cr2 = read_cr2();
ctxt->cr3 = read_cr3();
-#ifdef CONFIG_X86_32
- ctxt->cr4 = read_cr4_safe();
-#else
-/* CONFIG_X86_64 */
- ctxt->cr4 = read_cr4();
+ ctxt->cr4 = __read_cr4_safe();
+#ifdef CONFIG_X86_64
ctxt->cr8 = read_cr8();
#endif
ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
@@ -175,12 +172,12 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
/* cr4 was introduced in the Pentium CPU */
#ifdef CONFIG_X86_32
if (ctxt->cr4)
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#else
/* CONFIG X86_64 */
wrmsrl(MSR_EFER, ctxt->efer);
write_cr8(ctxt->cr8);
- write_cr4(ctxt->cr4);
+ __write_cr4(ctxt->cr4);
#endif
write_cr3(ctxt->cr3);
write_cr2(ctxt->cr2);
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index 94f7fbe97b08..e02c2c6c56a5 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -6,7 +6,7 @@
# for more details.
#
#
-
+KASAN_SANITIZE := n
subdir- := rm
obj-y += init.o
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index bad628a620c4..0b7a63d98440 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -81,7 +81,7 @@ void __init setup_real_mode(void)
trampoline_header->start = (u64) secondary_startup_64;
trampoline_cr4_features = &trampoline_header->cr4;
- *trampoline_cr4_features = read_cr4();
+ *trampoline_cr4_features = __read_cr4();
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 7c0d7be176a5..2730d775ef9a 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -6,6 +6,7 @@
# for more details.
#
#
+KASAN_SANITIZE := n
always := realmode.bin realmode.relocs
diff --git a/arch/x86/tools/calc_run_size.pl b/arch/x86/tools/calc_run_size.pl
deleted file mode 100644
index 23210baade2d..000000000000
--- a/arch/x86/tools/calc_run_size.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/perl
-#
-# Calculate the amount of space needed to run the kernel, including room for
-# the .bss and .brk sections.
-#
-# Usage:
-# objdump -h a.out | perl calc_run_size.pl
-use strict;
-
-my $mem_size = 0;
-my $file_offset = 0;
-
-my $sections=" *[0-9]+ \.(?:bss|brk) +";
-while (<>) {
- if (/^$sections([0-9a-f]+) +(?:[0-9a-f]+ +){2}([0-9a-f]+)/) {
- my $size = hex($1);
- my $offset = hex($2);
- $mem_size += $size;
- if ($file_offset == 0) {
- $file_offset = $offset;
- } elsif ($file_offset != $offset) {
- # BFD linker shows the same file offset in ELF.
- # Gold linker shows them as consecutive.
- next if ($file_offset + $mem_size == $offset + $size);
-
- printf STDERR "file_offset: 0x%lx\n", $file_offset;
- printf STDERR "mem_size: 0x%lx\n", $mem_size;
- printf STDERR "offset: 0x%lx\n", $offset;
- printf STDERR "size: 0x%lx\n", $size;
-
- die ".bss and .brk are non-contiguous\n";
- }
- }
-}
-
-if ($file_offset == 0) {
- die "Never found .bss or .brk file offset\n";
-}
-printf("%d\n", $mem_size + $file_offset);
diff --git a/arch/x86/tools/calc_run_size.sh b/arch/x86/tools/calc_run_size.sh
new file mode 100644
index 000000000000..1a4c17bb3910
--- /dev/null
+++ b/arch/x86/tools/calc_run_size.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# Calculate the amount of space needed to run the kernel, including room for
+# the .bss and .brk sections.
+#
+# Usage:
+# objdump -h a.out | sh calc_run_size.sh
+
+NUM='\([0-9a-fA-F]*[ \t]*\)'
+OUT=$(sed -n 's/^[ \t0-9]*.b[sr][sk][ \t]*'"$NUM$NUM$NUM$NUM"'.*/\1\4/p')
+if [ -z "$OUT" ] ; then
+ echo "Never found .bss or .brk file offset" >&2
+ exit 1
+fi
+
+OUT=$(echo ${OUT# })
+sizeA=$(printf "%d" 0x${OUT%% *})
+OUT=${OUT#* }
+offsetA=$(printf "%d" 0x${OUT%% *})
+OUT=${OUT#* }
+sizeB=$(printf "%d" 0x${OUT%% *})
+OUT=${OUT#* }
+offsetB=$(printf "%d" 0x${OUT%% *})
+
+run_size=$(( $offsetA + $sizeA + $sizeB ))
+
+# BFD linker shows the same file offset in ELF.
+if [ "$offsetA" -ne "$offsetB" ] ; then
+ # Gold linker shows them as consecutive.
+ endB=$(( $offsetB + $sizeB ))
+ if [ "$endB" != "$run_size" ] ; then
+ printf "sizeA: 0x%x\n" $sizeA >&2
+ printf "offsetA: 0x%x\n" $offsetA >&2
+ printf "sizeB: 0x%x\n" $sizeB >&2
+ printf "offsetB: 0x%x\n" $offsetB >&2
+ echo ".bss and .brk are non-contiguous" >&2
+ exit 1
+ fi
+fi
+
+printf "%d\n" $run_size
+exit 0
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 79d824551c1a..0c8c32bfd792 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs,
int err, pid;
/* Always make any pending restarted system calls return -EINTR */
- current_thread_info()->restart_block.fn = do_no_restart_syscall;
+ current->restart_block.fn = do_no_restart_syscall;
err = copy_from_user(&sc, from, sizeof(sc));
if (err)
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 5a4affe025e8..7b9be9822724 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -3,6 +3,7 @@
#
KBUILD_CFLAGS += $(DISABLE_LTO)
+KASAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
@@ -205,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80*
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 009495b9ab4b..1c9f750c3859 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -41,12 +41,17 @@ void __init init_vdso_image(const struct vdso_image *image)
struct linux_binprm;
-/* Put the vdso above the (randomized) stack with another randomized offset.
- This way there is no hole in the middle of address space.
- To save memory make sure it is still in the same PTE as the stack top.
- This doesn't give that many random bits.
-
- Only used for the 64-bit and x32 vdsos. */
+/*
+ * Put the vdso above the (randomized) stack with another randomized
+ * offset. This way there is no hole in the middle of address space.
+ * To save memory make sure it is still in the same PTE as the stack
+ * top. This doesn't give that many random bits.
+ *
+ * Note that this algorithm is imperfect: the distribution of the vdso
+ * start address within a PMD is biased toward the end.
+ *
+ * Only used for the 64-bit and x32 vdsos.
+ */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
@@ -54,22 +59,30 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
#else
unsigned long addr, end;
unsigned offset;
- end = (start + PMD_SIZE - 1) & PMD_MASK;
+
+ /*
+ * Round up the start address. It can start out unaligned as a result
+ * of stack start randomization.
+ */
+ start = PAGE_ALIGN(start);
+
+ /* Round the lowest possible end address up to a PMD boundary. */
+ end = (start + len + PMD_SIZE - 1) & PMD_MASK;
if (end >= TASK_SIZE_MAX)
end = TASK_SIZE_MAX;
end -= len;
- /* This loses some more bits than a modulo, but is cheaper */
- offset = get_random_int() & (PTRS_PER_PTE - 1);
- addr = start + (offset << PAGE_SHIFT);
- if (addr >= end)
- addr = end;
+
+ if (end > start) {
+ offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
+ addr = start + (offset << PAGE_SHIFT);
+ } else {
+ addr = start;
+ }
/*
- * page-align it here so that get_unmapped_area doesn't
- * align it wrongfully again to the next page. addr can come in 4K
- * unaligned here as a result of stack start randomization.
+ * Forcibly align the final address in case we have a hardware
+ * issue that requires alignment for performance reasons.
*/
- addr = PAGE_ALIGN(addr);
addr = align_vdso_addr(addr);
return addr;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 6bf3a13e3e0f..bd8b8459c3d0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -40,6 +40,7 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
+#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
@@ -66,6 +67,7 @@
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
+#include <asm/mach_traps.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/pat.h>
@@ -1351,6 +1353,21 @@ static const struct machine_ops xen_machine_ops __initconst = {
.emergency_restart = xen_emergency_restart,
};
+static unsigned char xen_get_nmi_reason(void)
+{
+ unsigned char reason = 0;
+
+ /* Construct a value which looks like it came from port 0x61. */
+ if (test_bit(_XEN_NMIREASON_io_error,
+ &HYPERVISOR_shared_info->arch.nmi_reason))
+ reason |= NMI_REASON_IOCHK;
+ if (test_bit(_XEN_NMIREASON_pci_serr,
+ &HYPERVISOR_shared_info->arch.nmi_reason))
+ reason |= NMI_REASON_SERR;
+
+ return reason;
+}
+
static void __init xen_boot_params_init_edd(void)
{
#if IS_ENABLED(CONFIG_EDD)
@@ -1477,10 +1494,10 @@ static void xen_pvh_set_cr_flags(int cpu)
* set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
*/
if (cpu_has_pse)
- set_in_cr4(X86_CR4_PSE);
+ cr4_set_bits_and_update_boot(X86_CR4_PSE);
if (cpu_has_pge)
- set_in_cr4(X86_CR4_PGE);
+ cr4_set_bits_and_update_boot(X86_CR4_PGE);
}
/*
@@ -1535,9 +1552,12 @@ asmlinkage __visible void __init xen_start_kernel(void)
pv_info = xen_info;
pv_init_ops = xen_init_ops;
pv_apic_ops = xen_apic_ops;
- if (!xen_pvh_domain())
+ if (!xen_pvh_domain()) {
pv_cpu_ops = xen_cpu_ops;
+ x86_platform.get_nmi_reason = xen_get_nmi_reason;
+ }
+
if (xen_feature(XENFEAT_auto_translated_physmap))
x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
else
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 5c1f9ace7ae7..adca9e2b6553 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1489,7 +1489,7 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
struct mmuext_op op;
op.cmd = cmd;
@@ -1657,7 +1657,7 @@ void __init xen_reserve_top(void)
* Like __va(), but returns address in the kernel mapping (which is
* all we have until the physical memory mapping has been set up.
*/
-static void *__ka(phys_addr_t paddr)
+static void * __init __ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
return (void *)(paddr + __START_KERNEL_map);
@@ -1667,7 +1667,7 @@ static void *__ka(phys_addr_t paddr)
}
/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
+static unsigned long __init m2p(phys_addr_t maddr)
{
phys_addr_t paddr;
@@ -1678,13 +1678,14 @@ static unsigned long m2p(phys_addr_t maddr)
}
/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
+static void * __init m2v(phys_addr_t maddr)
{
return __ka(m2p(maddr));
}
/* Set the page permissions on an identity-mapped pages */
-static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
+static void __init set_page_prot_flags(void *addr, pgprot_t prot,
+ unsigned long flags)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
@@ -1696,7 +1697,7 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
BUG();
}
-static void set_page_prot(void *addr, pgprot_t prot)
+static void __init set_page_prot(void *addr, pgprot_t prot)
{
return set_page_prot_flags(addr, prot, UVMF_NONE);
}
@@ -1733,10 +1734,8 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
pte_t pte;
-#ifdef CONFIG_X86_32
if (pfn > max_pfn_mapped)
max_pfn_mapped = pfn;
-#endif
if (!pte_none(pte_page[pteidx]))
continue;
@@ -1769,7 +1768,7 @@ void __init xen_setup_machphys_mapping(void)
}
#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
+static void __init convert_pfn_mfn(void *v)
{
pte_t *pte = v;
int i;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index edbc7a63fd73..740ae3026a14 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -84,8 +84,6 @@
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
-static void __init m2p_override_init(void);
-
unsigned long *xen_p2m_addr __read_mostly;
EXPORT_SYMBOL_GPL(xen_p2m_addr);
unsigned long xen_p2m_size __read_mostly;
@@ -167,10 +165,13 @@ static void * __ref alloc_p2m_page(void)
return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
}
-/* Only to be called in case of a race for a page just allocated! */
-static void free_p2m_page(void *p)
+static void __ref free_p2m_page(void *p)
{
- BUG_ON(!slab_is_available());
+ if (unlikely(!slab_is_available())) {
+ free_bootmem((unsigned long)p, PAGE_SIZE);
+ return;
+ }
+
free_page((unsigned long)p);
}
@@ -375,7 +376,7 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m)
p2m_missing_pte : p2m_identity_pte;
for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
pmdp = populate_extra_pmd(
- (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
+ (unsigned long)(p2m + pfn) + i * PMD_SIZE);
set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
}
}
@@ -399,8 +400,6 @@ void __init xen_vmalloc_p2m_tree(void)
xen_p2m_size = xen_max_p2m_pfn;
xen_inv_extra_mem();
-
- m2p_override_init();
}
unsigned long get_phys_to_machine(unsigned long pfn)
@@ -436,10 +435,9 @@ EXPORT_SYMBOL_GPL(get_phys_to_machine);
* a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual
* pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
*/
-static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
+static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
{
pte_t *ptechk;
- pte_t *pteret = ptep;
pte_t *pte_newpg[PMDS_PER_MID_PAGE];
pmd_t *pmdp;
unsigned int level;
@@ -473,8 +471,6 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
if (ptechk == pte_pg) {
set_pmd(pmdp,
__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
- if (vaddr == (addr & ~(PMD_SIZE - 1)))
- pteret = pte_offset_kernel(pmdp, addr);
pte_newpg[i] = NULL;
}
@@ -488,7 +484,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
vaddr += PMD_SIZE;
}
- return pteret;
+ return lookup_address(addr, &level);
}
/*
@@ -517,7 +513,7 @@ static bool alloc_p2m(unsigned long pfn)
if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
/* PMD level is missing, allocate a new one */
- ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
+ ptep = alloc_p2m_pmd(addr, pte_pg);
if (!ptep)
return false;
}
@@ -554,7 +550,7 @@ static bool alloc_p2m(unsigned long pfn)
mid_mfn = NULL;
}
- p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
+ p2m_pfn = pte_pfn(READ_ONCE(*ptep));
if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
/* p2m leaf page is missing */
@@ -652,100 +648,21 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
return true;
}
-#define M2P_OVERRIDE_HASH_SHIFT 10
-#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
-
-static struct list_head *m2p_overrides;
-static DEFINE_SPINLOCK(m2p_override_lock);
-
-static void __init m2p_override_init(void)
-{
- unsigned i;
-
- m2p_overrides = alloc_bootmem_align(
- sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
- sizeof(unsigned long));
-
- for (i = 0; i < M2P_OVERRIDE_HASH; i++)
- INIT_LIST_HEAD(&m2p_overrides[i]);
-}
-
-static unsigned long mfn_hash(unsigned long mfn)
-{
- return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
-}
-
-/* Add an MFN override for a particular page */
-static int m2p_add_override(unsigned long mfn, struct page *page,
- struct gnttab_map_grant_ref *kmap_op)
-{
- unsigned long flags;
- unsigned long pfn;
- unsigned long uninitialized_var(address);
- unsigned level;
- pte_t *ptep = NULL;
-
- pfn = page_to_pfn(page);
- if (!PageHighMem(page)) {
- address = (unsigned long)__va(pfn << PAGE_SHIFT);
- ptep = lookup_address(address, &level);
- if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_add_override: pfn %lx not mapped", pfn))
- return -EINVAL;
- }
-
- if (kmap_op != NULL) {
- if (!PageHighMem(page)) {
- struct multicall_space mcs =
- xen_mc_entry(sizeof(*kmap_op));
-
- MULTI_grant_table_op(mcs.mc,
- GNTTABOP_map_grant_ref, kmap_op, 1);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
- }
- }
- spin_lock_irqsave(&m2p_override_lock, flags);
- list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
- spin_unlock_irqrestore(&m2p_override_lock, flags);
-
- /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
- * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
- * pfn so that the following mfn_to_pfn(mfn) calls will return the
- * pfn from the m2p_override (the backend pfn) instead.
- * We need to do this because the pages shared by the frontend
- * (xen-blkfront) can be already locked (lock_page, called by
- * do_read_cache_page); when the userspace backend tries to use them
- * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
- * do_blockdev_direct_IO is going to try to lock the same pages
- * again resulting in a deadlock.
- * As a side effect get_user_pages_fast might not be safe on the
- * frontend pages while they are being shared with the backend,
- * because mfn_to_pfn (that ends up being called by GUPF) will
- * return the backend pfn rather than the frontend pfn. */
- pfn = mfn_to_pfn_no_overrides(mfn);
- if (__pfn_to_mfn(pfn) == mfn)
- set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
-
- return 0;
-}
-
int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count)
{
int i, ret = 0;
- bool lazy = false;
pte_t *pte;
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
- if (kmap_ops &&
- !in_interrupt() &&
- paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
- arch_enter_lazy_mmu_mode();
- lazy = true;
+ if (kmap_ops) {
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ kmap_ops, count);
+ if (ret)
+ goto out;
}
for (i = 0; i < count; i++) {
@@ -764,170 +681,28 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
}
pfn = page_to_pfn(pages[i]);
- WARN_ON(PagePrivate(pages[i]));
- SetPagePrivate(pages[i]);
- set_page_private(pages[i], mfn);
- pages[i]->index = pfn_to_mfn(pfn);
+ WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");
if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
ret = -ENOMEM;
goto out;
}
-
- if (kmap_ops) {
- ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
- if (ret)
- goto out;
- }
}
out:
- if (lazy)
- arch_leave_lazy_mmu_mode();
-
return ret;
}
EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
-static struct page *m2p_find_override(unsigned long mfn)
-{
- unsigned long flags;
- struct list_head *bucket;
- struct page *p, *ret;
-
- if (unlikely(!m2p_overrides))
- return NULL;
-
- ret = NULL;
- bucket = &m2p_overrides[mfn_hash(mfn)];
-
- spin_lock_irqsave(&m2p_override_lock, flags);
-
- list_for_each_entry(p, bucket, lru) {
- if (page_private(p) == mfn) {
- ret = p;
- break;
- }
- }
-
- spin_unlock_irqrestore(&m2p_override_lock, flags);
-
- return ret;
-}
-
-static int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op,
- unsigned long mfn)
-{
- unsigned long flags;
- unsigned long pfn;
- unsigned long uninitialized_var(address);
- unsigned level;
- pte_t *ptep = NULL;
-
- pfn = page_to_pfn(page);
-
- if (!PageHighMem(page)) {
- address = (unsigned long)__va(pfn << PAGE_SHIFT);
- ptep = lookup_address(address, &level);
-
- if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_remove_override: pfn %lx not mapped", pfn))
- return -EINVAL;
- }
-
- spin_lock_irqsave(&m2p_override_lock, flags);
- list_del(&page->lru);
- spin_unlock_irqrestore(&m2p_override_lock, flags);
-
- if (kmap_op != NULL) {
- if (!PageHighMem(page)) {
- struct multicall_space mcs;
- struct gnttab_unmap_and_replace *unmap_op;
- struct page *scratch_page = get_balloon_scratch_page();
- unsigned long scratch_page_address = (unsigned long)
- __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
-
- /*
- * It might be that we queued all the m2p grant table
- * hypercalls in a multicall, then m2p_remove_override
- * get called before the multicall has actually been
- * issued. In this case handle is going to -1 because
- * it hasn't been modified yet.
- */
- if (kmap_op->handle == -1)
- xen_mc_flush();
- /*
- * Now if kmap_op->handle is negative it means that the
- * hypercall actually returned an error.
- */
- if (kmap_op->handle == GNTST_general_error) {
- pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
- pfn, mfn);
- put_balloon_scratch_page();
- return -1;
- }
-
- xen_mc_batch();
-
- mcs = __xen_mc_entry(
- sizeof(struct gnttab_unmap_and_replace));
- unmap_op = mcs.args;
- unmap_op->host_addr = kmap_op->host_addr;
- unmap_op->new_addr = scratch_page_address;
- unmap_op->handle = kmap_op->handle;
-
- MULTI_grant_table_op(mcs.mc,
- GNTTABOP_unmap_and_replace, unmap_op, 1);
-
- mcs = __xen_mc_entry(0);
- MULTI_update_va_mapping(mcs.mc, scratch_page_address,
- pfn_pte(page_to_pfn(scratch_page),
- PAGE_KERNEL_RO), 0);
-
- xen_mc_issue(PARAVIRT_LAZY_MMU);
-
- kmap_op->host_addr = 0;
- put_balloon_scratch_page();
- }
- }
-
- /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present
- * somewhere in this domain, even before being added to the
- * m2p_override (see comment above in m2p_add_override).
- * If there are no other entries in the m2p_override corresponding
- * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for
- * the original pfn (the one shared by the frontend): the backend
- * cannot do any IO on this page anymore because it has been
- * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of
- * the original pfn causes mfn_to_pfn(mfn) to return the frontend
- * pfn again. */
- mfn &= ~FOREIGN_FRAME_BIT;
- pfn = mfn_to_pfn_no_overrides(mfn);
- if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
- m2p_find_override(mfn) == NULL)
- set_phys_to_machine(pfn, mfn);
-
- return 0;
-}
-
int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
- struct gnttab_map_grant_ref *kmap_ops,
+ struct gnttab_unmap_grant_ref *kunmap_ops,
struct page **pages, unsigned int count)
{
int i, ret = 0;
- bool lazy = false;
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
- if (kmap_ops &&
- !in_interrupt() &&
- paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
- arch_enter_lazy_mmu_mode();
- lazy = true;
- }
-
for (i = 0; i < count; i++) {
unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
unsigned long pfn = page_to_pfn(pages[i]);
@@ -937,36 +712,16 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
goto out;
}
- set_page_private(pages[i], INVALID_P2M_ENTRY);
- WARN_ON(!PagePrivate(pages[i]));
- ClearPagePrivate(pages[i]);
- set_phys_to_machine(pfn, pages[i]->index);
-
- if (kmap_ops)
- ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
- if (ret)
- goto out;
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
-
+ if (kunmap_ops)
+ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ kunmap_ops, count);
out:
- if (lazy)
- arch_leave_lazy_mmu_mode();
return ret;
}
EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
-unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
-{
- struct page *p = m2p_find_override(mfn);
- unsigned long ret = pfn;
-
- if (p)
- ret = page_to_pfn(p);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
-
#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
#include "debugfs.h"
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index dfd77dec8e2b..55f388ef481a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -32,16 +32,6 @@
#include "p2m.h"
#include "mmu.h"
-/* These are code, but not functions. Defined in entry.S */
-extern const char xen_hypervisor_callback[];
-extern const char xen_failsafe_callback[];
-#ifdef CONFIG_X86_64
-extern asmlinkage void nmi(void);
-#endif
-extern void xen_sysenter_target(void);
-extern void xen_syscall_target(void);
-extern void xen_syscall32_target(void);
-
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
@@ -74,7 +64,7 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
*/
#define EXTRA_MEM_RATIO (10)
-static void __init xen_add_extra_mem(u64 start, u64 size)
+static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size)
{
int i;
@@ -97,10 +87,10 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
memblock_reserve(start, size);
}
-static void __init xen_del_extra_mem(u64 start, u64 size)
+static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size)
{
int i;
- u64 start_r, size_r;
+ phys_addr_t start_r, size_r;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
start_r = xen_extra_mem[i].start;
@@ -140,7 +130,7 @@ static void __init xen_del_extra_mem(u64 start, u64 size)
unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
int i;
- unsigned long addr = PFN_PHYS(pfn);
+ phys_addr_t addr = PFN_PHYS(pfn);
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
if (addr >= xen_extra_mem[i].start &&
@@ -160,6 +150,8 @@ void __init xen_inv_extra_mem(void)
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ if (!xen_extra_mem[i].size)
+ continue;
pfn_s = PFN_DOWN(xen_extra_mem[i].start);
pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
for (pfn = pfn_s; pfn < pfn_e; pfn++)
@@ -229,15 +221,14 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
- unsigned long *released)
+ unsigned long end_pfn, unsigned long nr_pages, unsigned long *released)
{
- unsigned long len = 0;
unsigned long pfn, end;
int ret;
WARN_ON(start_pfn > end_pfn);
+ /* Release pages first. */
end = min(end_pfn, nr_pages);
for (pfn = start_pfn; pfn < end; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
@@ -250,16 +241,14 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
if (ret == 1) {
+ (*released)++;
if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
break;
- len++;
} else
break;
}
- /* Need to release pages first */
- *released += len;
- *identity += set_phys_range_identity(start_pfn, end_pfn);
+ set_phys_range_identity(start_pfn, end_pfn);
}
/*
@@ -268,7 +257,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
struct mmu_update update = {
- .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+ .ptr = ((uint64_t)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
.val = pfn
};
@@ -287,7 +276,7 @@ static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
}
/* Update kernel mapping, but not for highmem. */
- if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
+ if (pfn >= PFN_UP(__pa(high_memory - 1)))
return;
if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
@@ -318,7 +307,6 @@ static void __init xen_do_set_identity_and_remap_chunk(
unsigned long ident_pfn_iter, remap_pfn_iter;
unsigned long ident_end_pfn = start_pfn + size;
unsigned long left = size;
- unsigned long ident_cnt = 0;
unsigned int i, chunk;
WARN_ON(size == 0);
@@ -347,8 +335,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
xen_remap_mfn = mfn;
/* Set identity map */
- ident_cnt += set_phys_range_identity(ident_pfn_iter,
- ident_pfn_iter + chunk);
+ set_phys_range_identity(ident_pfn_iter, ident_pfn_iter + chunk);
left -= chunk;
}
@@ -371,7 +358,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
- unsigned long *identity, unsigned long *released)
+ unsigned long *released, unsigned long *remapped)
{
unsigned long pfn;
unsigned long i = 0;
@@ -386,8 +373,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Do not remap pages beyond the current allocation */
if (cur_pfn >= nr_pages) {
/* Identity map remaining pages */
- *identity += set_phys_range_identity(cur_pfn,
- cur_pfn + size);
+ set_phys_range_identity(cur_pfn, cur_pfn + size);
break;
}
if (cur_pfn + size > nr_pages)
@@ -398,7 +384,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages, identity, released);
+ cur_pfn + left, nr_pages, released);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -410,7 +396,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
- *identity += size;
+ *remapped += size;
}
/*
@@ -427,13 +413,13 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
static void __init xen_set_identity_and_remap(
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
- unsigned long *released)
+ unsigned long *released, unsigned long *remapped)
{
phys_addr_t start = 0;
- unsigned long identity = 0;
unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
unsigned long num_released = 0;
+ unsigned long num_remapped = 0;
int i;
/*
@@ -460,14 +446,14 @@ static void __init xen_set_identity_and_remap(
last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn,
end_pfn, nr_pages, last_pfn,
- &identity, &num_released);
+ &num_released, &num_remapped);
start = end;
}
}
*released = num_released;
+ *remapped = num_remapped;
- pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
pr_info("Released %ld page(s)\n", num_released);
}
@@ -549,20 +535,21 @@ static unsigned long __init xen_get_max_pages(void)
return min(max_pages, MAX_DOMAIN_PAGES);
}
-static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
+static void __init xen_align_and_add_e820_region(phys_addr_t start,
+ phys_addr_t size, int type)
{
- u64 end = start + size;
+ phys_addr_t end = start + size;
/* Align RAM regions to page boundaries. */
if (type == E820_RAM) {
start = PAGE_ALIGN(start);
- end &= ~((u64)PAGE_SIZE - 1);
+ end &= ~((phys_addr_t)PAGE_SIZE - 1);
}
e820_add_region(start, end - start, type);
}
-void xen_ignore_unusable(struct e820entry *list, size_t map_size)
+static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
struct e820entry *entry;
unsigned int i;
@@ -581,11 +568,12 @@ char * __init xen_memory_setup(void)
static struct e820entry map[E820MAX] __initdata;
unsigned long max_pfn = xen_start_info->nr_pages;
- unsigned long long mem_end;
+ phys_addr_t mem_end;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
+ unsigned long remapped_pages;
int i;
int op;
@@ -635,9 +623,10 @@ char * __init xen_memory_setup(void)
* underlying RAM.
*/
xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
- &xen_released_pages);
+ &xen_released_pages, &remapped_pages);
extra_pages += xen_released_pages;
+ extra_pages += remapped_pages;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -654,16 +643,16 @@ char * __init xen_memory_setup(void)
extra_pages);
i = 0;
while (i < memmap.nr_entries) {
- u64 addr = map[i].addr;
- u64 size = map[i].size;
+ phys_addr_t addr = map[i].addr;
+ phys_addr_t size = map[i].size;
u32 type = map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
size = min(size, mem_end - addr);
} else if (extra_pages) {
- size = min(size, (u64)extra_pages * PAGE_SIZE);
- extra_pages -= size / PAGE_SIZE;
+ size = min(size, PFN_PHYS(extra_pages));
+ extra_pages -= PFN_DOWN(size);
xen_add_extra_mem(addr, size);
xen_max_p2m_pfn = PFN_DOWN(addr + size);
} else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 4c071aeb8417..08e8489c47f1 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -507,7 +507,7 @@ static int xen_cpu_disable(void)
static void xen_cpu_die(unsigned int cpu)
{
while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
- current->state = TASK_UNINTERRUPTIBLE;
+ __set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(HZ/10);
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23b45eb9a89c..956374c1edbc 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -41,7 +41,7 @@ static u8 zero_stats;
static inline void check_zero(void)
{
u8 ret;
- u8 old = ACCESS_ONCE(zero_stats);
+ u8 old = READ_ONCE(zero_stats);
if (unlikely(old)) {
ret = cmpxchg(&zero_stats, old, 0);
/* This ensures only one fellow resets the stat */
@@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting);
int cpu = smp_processor_id();
u64 start;
+ __ticket_t head;
unsigned long flags;
/* If kicker interrupts not initialized yet, just spin */
@@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
*/
__ticket_enter_slowpath(lock);
+ /* make sure enter_slowpath, which is atomic does not cross the read */
+ smp_mb__after_atomic();
+
/*
* check again make sure it didn't become free while
* we weren't looking
*/
- if (ACCESS_ONCE(lock->tickets.head) == want) {
+ head = READ_ONCE(lock->tickets.head);
+ if (__tickets_equal(head, want)) {
add_stats(TAKEN_SLOW_PICKUP, 1);
goto out;
}
@@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
/* Make sure we read lock before want */
- if (ACCESS_ONCE(w->lock) == lock &&
- ACCESS_ONCE(w->want) == next) {
+ if (READ_ONCE(w->lock) == lock &&
+ READ_ONCE(w->want) == next) {
add_stats(RELEASED_SLOW_KICKED, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
break;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index f473d268d387..55da33b1d51c 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -391,7 +391,7 @@ static const struct clock_event_device *xen_clockevent =
struct xen_clock_event_device {
struct clock_event_device evt;
- char *name;
+ char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };
@@ -420,46 +420,38 @@ void xen_teardown_timer(int cpu)
if (evt->irq >= 0) {
unbind_from_irqhandler(evt->irq, NULL);
evt->irq = -1;
- kfree(per_cpu(xen_clock_events, cpu).name);
- per_cpu(xen_clock_events, cpu).name = NULL;
}
}
void xen_setup_timer(int cpu)
{
- char *name;
- struct clock_event_device *evt;
+ struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
+ struct clock_event_device *evt = &xevt->evt;
int irq;
- evt = &per_cpu(xen_clock_events, cpu).evt;
WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
if (evt->irq >= 0)
xen_teardown_timer(cpu);
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
- name = kasprintf(GFP_KERNEL, "timer%d", cpu);
- if (!name)
- name = "<timer kasprintf failed>";
+ snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
- name, NULL);
+ xevt->name, NULL);
(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
- per_cpu(xen_clock_events, cpu).name = name;
}
void xen_setup_cpu_clockevents(void)
{
- BUG_ON(preemptible());
-
clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}
@@ -487,6 +479,10 @@ static void __init xen_time_init(void)
int cpu = smp_processor_id();
struct timespec tp;
+ /* As Dom0 is never moved, no penalty on using TSC there */
+ if (xen_initial_domain())
+ xen_clocksource.rating = 275;
+
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 5686bd9d58cc..9e195c683549 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,6 +10,12 @@
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
+void xen_sysenter_target(void);
+#ifdef CONFIG_X86_64
+void xen_syscall_target(void);
+void xen_syscall32_target(void);
+#endif
+
extern void *xen_initial_gdt;
struct trap_info;