aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig38
-rw-r--r--arch/x86/Kconfig.debug17
-rw-r--r--arch/x86/boot/Makefile1
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c26
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile1
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c12
-rw-r--r--arch/x86/include/asm/barrier.h36
-rw-r--r--arch/x86/include/asm/boot.h2
-rw-r--r--arch/x86/include/asm/device.h10
-rw-r--r--arch/x86/include/asm/dma-mapping.h2
-rw-r--r--arch/x86/include/asm/fpu/internal.h1
-rw-r--r--arch/x86/include/asm/fpu/xstate.h11
-rw-r--r--arch/x86/include/asm/hw_irq.h5
-rw-r--r--arch/x86/include/asm/intel_punit_ipc.h101
-rw-r--r--arch/x86/include/asm/intel_telemetry.h147
-rw-r--r--arch/x86/include/asm/iosf_mbi.h51
-rw-r--r--arch/x86/include/asm/kvm_host.h75
-rw-r--r--arch/x86/include/asm/lguest.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h34
-rw-r--r--arch/x86/include/asm/pci_x86.h10
-rw-r--r--arch/x86/include/asm/pgtable.h40
-rw-r--r--arch/x86/include/asm/pgtable_types.h9
-rw-r--r--arch/x86/include/asm/platform_sst_audio.h1
-rw-r--r--arch/x86/include/asm/pmem.h18
-rw-r--r--arch/x86/include/asm/uaccess.h78
-rw-r--r--arch/x86/include/asm/uaccess_64.h94
-rw-r--r--arch/x86/include/asm/xen/hypercall.h6
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h92
-rw-r--r--arch/x86/kernel/cpu/amd.c3
-rw-r--r--arch/x86/kernel/cpuid.c24
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/fpu/init.c161
-rw-r--r--arch/x86/kernel/fpu/xstate.c4
-rw-r--r--arch/x86/kernel/ftrace.c21
-rw-r--r--arch/x86/kernel/livepatch.c29
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/msr.c24
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/verify_cpu.S50
-rw-r--r--arch/x86/kernel/vm86_32.c6
-rw-r--r--arch/x86/kvm/hyperv.c708
-rw-r--r--arch/x86/kvm/hyperv.h55
-rw-r--r--arch/x86/kvm/ioapic.c4
-rw-r--r--arch/x86/kvm/ioapic.h7
-rw-r--r--arch/x86/kvm/iommu.c11
-rw-r--r--arch/x86/kvm/irq.c2
-rw-r--r--arch/x86/kvm/irq_comm.c41
-rw-r--r--arch/x86/kvm/lapic.c40
-rw-r--r--arch/x86/kvm/lapic.h9
-rw-r--r--arch/x86/kvm/mmu.c430
-rw-r--r--arch/x86/kvm/mmu_audit.c17
-rw-r--r--arch/x86/kvm/paging_tmpl.h26
-rw-r--r--arch/x86/kvm/svm.c37
-rw-r--r--arch/x86/kvm/trace.h265
-rw-r--r--arch/x86/kvm/vmx.c204
-rw-r--r--arch/x86/kvm/x86.c112
-rw-r--r--arch/x86/mm/gup.c74
-rw-r--r--arch/x86/mm/init_64.c36
-rw-r--r--arch/x86/mm/mmap.c12
-rw-r--r--arch/x86/mm/pageattr.c3
-rw-r--r--arch/x86/mm/pat.c5
-rw-r--r--arch/x86/mm/pgtable.c13
-rw-r--r--arch/x86/mm/tlb.c29
-rw-r--r--arch/x86/net/bpf_jit_comp.c40
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/common.c38
-rw-r--r--arch/x86/pci/pcbios.c108
-rw-r--r--arch/x86/pci/vmd.c723
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c7
-rw-r--r--arch/x86/platform/intel-quark/imr.c28
-rw-r--r--arch/x86/realmode/rm/Makefile1
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/barrier.h9
-rw-r--r--arch/x86/um/asm/syscall.h1
-rw-r--r--arch/x86/um/ptrace_32.c8
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c8
-rw-r--r--arch/x86/xen/suspend.c3
-rw-r--r--arch/x86/xen/time.c115
82 files changed, 3490 insertions, 1003 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 258965d56beb..330e738ccfc1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -24,12 +24,14 @@ config X86
select ARCH_DISCARD_MEMBLOCK
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
+ select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
@@ -82,6 +84,8 @@ config X86
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_ARCH_KGDB
select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
@@ -96,7 +100,6 @@ config X86
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_API_DEBUG
- select HAVE_DMA_ATTRS
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
@@ -177,12 +180,23 @@ config LOCKDEP_SUPPORT
config STACKTRACE_SUPPORT
def_bool y
-config HAVE_LATENCYTOP_SUPPORT
- def_bool y
-
config MMU
def_bool y
+config ARCH_MMAP_RND_BITS_MIN
+ default 28 if 64BIT
+ default 8
+
+config ARCH_MMAP_RND_BITS_MAX
+ default 32 if 64BIT
+ default 16
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ default 8
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ default 16
+
config SBUS
bool
@@ -534,9 +548,10 @@ config X86_INTEL_QUARK
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
- depends on ACPI
+ depends on X86 && ACPI
select COMMON_CLK
select PINCTRL
+ select IOSF_MBI
---help---
Select to build support for Intel Low Power Subsystem such as
found on Intel Lynxpoint PCH. Selecting this option enables
@@ -2684,6 +2699,19 @@ config PMC_ATOM
def_bool y
depends on PCI
+config VMD
+ depends on PCI_MSI
+ tristate "Volume Management Device Driver"
+ default N
+ ---help---
+ Adds support for the Intel Volume Management Device (VMD). VMD is a
+ secondary PCI host bridge that allows PCI Express root ports,
+ and devices attached to them, to be removed from the default
+ PCI domain and placed within the VMD domain. This provides
+ more bus resources than are otherwise possible with a
+ single domain. If you know your system provides one of these and
+ has devices attached to it, say Y; if you are not sure, say N.
+
source "net/Kconfig"
source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 110253ce83af..9b18ed97a8a2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,23 +5,6 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
-config STRICT_DEVMEM
- bool "Filter access to /dev/mem"
- ---help---
- If this option is disabled, you allow userspace (root) access to all
- of memory, including kernel and userspace memory. Accidental
- access to this is obviously disastrous, but specific access can
- be used by people debugging the kernel. Note that with PAT support
- enabled, even in this case there are restrictions on /dev/mem
- use due to the cache aliasing requirements.
-
- If this option is switched on, the /dev/mem file only allows
- userspace access to PCI space and the BIOS code and data regions.
- This is sufficient for dosemu and X and all common users of
- /dev/mem.
-
- If in doubt, say Y.
-
config X86_VERBOSE_BOOTUP
bool "Enable verbose x86 bootup info messages"
default y
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 2ee62dba0373..bbe1a62efc02 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -60,6 +60,7 @@ clean-files += cpustr.h
KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE := n
$(obj)/bzImage: asflags-y := $(SVGA_MODE)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0a291cdfaf77..f9ce75d80101 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -33,6 +33,7 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE :=n
LDFLAGS := -m elf_$(UTS_MACHINE)
LDFLAGS_vmlinux := -T
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 440df0c7a2ee..a69321a77783 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -219,6 +219,29 @@ static int ghash_async_final(struct ahash_request *req)
}
}
+static int ghash_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ ghash_async_init(req);
+ memcpy(dctx, in, sizeof(*dctx));
+ return 0;
+
+}
+
+static int ghash_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *cryptd_req = ahash_request_ctx(req);
+ struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
+ struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+ memcpy(out, dctx, sizeof(*dctx));
+ return 0;
+
+}
+
static int ghash_async_digest(struct ahash_request *req)
{
struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
@@ -288,8 +311,11 @@ static struct ahash_alg ghash_async_alg = {
.final = ghash_async_final,
.setkey = ghash_async_setkey,
.digest = ghash_async_digest,
+ .export = ghash_async_export,
+ .import = ghash_async_import,
.halg = {
.digestsize = GHASH_DIGEST_SIZE,
+ .statesize = sizeof(struct ghash_desc_ctx),
.base = {
.cra_name = "ghash",
.cra_driver_name = "ghash-clmulni",
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f17705e1332c..cb713df81180 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..dc1040a50bdc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 265c0ed68118..c854541d93ff 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -4,6 +4,7 @@
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 8602f06c759f..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
*
* On Xen, we don't appear to have that guarantee, but Xen still
* supplies a valid seqlock using the version field.
-
+ *
* We only do pvclock vdso timing at all if
* PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
* mean that all vCPUs have matching pvti and that the TSC is
* synced, so we can just look at vCPU 0's pvti.
*/
- if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
- *mode = VCLOCK_NONE;
- return 0;
- }
-
do {
version = pvti->version;
smp_rmb();
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
tsc = rdtsc_ordered();
pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
pvti_tsc_shift = pvti->tsc_shift;
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 0681d2532527..a584e1c50918 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -31,20 +31,10 @@
#endif
#define dma_wmb() barrier()
-#ifdef CONFIG_SMP
-#define smp_mb() mb()
-#define smp_rmb() dma_rmb()
-#define smp_wmb() barrier()
-#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
-#else /* !SMP */
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
-#endif /* SMP */
-
-#define read_barrier_depends() do { } while (0)
-#define smp_read_barrier_depends() do { } while (0)
+#define __smp_mb() mb()
+#define __smp_rmb() dma_rmb()
+#define __smp_wmb() barrier()
+#define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
#if defined(CONFIG_X86_PPRO_FENCE)
@@ -53,31 +43,31 @@
* model and we should fall back to full barriers.
*/
-#define smp_store_release(p, v) \
+#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
- smp_mb(); \
+ __smp_mb(); \
WRITE_ONCE(*p, v); \
} while (0)
-#define smp_load_acquire(p) \
+#define __smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
- smp_mb(); \
+ __smp_mb(); \
___p1; \
})
#else /* regular x86 TSO memory ordering */
-#define smp_store_release(p, v) \
+#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
barrier(); \
WRITE_ONCE(*p, v); \
} while (0)
-#define smp_load_acquire(p) \
+#define __smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
@@ -88,7 +78,9 @@ do { \
#endif
/* Atomic operations are already serializing on x86 */
-#define smp_mb__before_atomic() barrier()
-#define smp_mb__after_atomic() barrier()
+#define __smp_mb__before_atomic() barrier()
+#define __smp_mb__after_atomic() barrier()
+
+#include <asm-generic/barrier.h>
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
-#define BOOT_HEAP_SIZE 0x8000
+#define BOOT_HEAP_SIZE 0x10000
#endif /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 03dd72957d2f..684ed6c3aa67 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -10,6 +10,16 @@ struct dev_archdata {
#endif
};
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+struct dma_domain {
+ struct list_head node;
+ struct dma_map_ops *dma_ops;
+ int domain_nr;
+};
+void add_dma_domain(struct dma_domain *domain);
+void del_dma_domain(struct dma_domain *domain);
+#endif
+
struct pdev_archdata {
};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 953b7263f844..3a27b93e6261 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -46,8 +46,6 @@ bool arch_dma_alloc_attrs(struct device **dev, gfp_t *gfp);
#define HAVE_ARCH_DMA_SUPPORTED 1
extern int dma_supported(struct device *hwdev, u64 mask);
-#include <asm-generic/dma-mapping-common.h>
-
extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
dma_addr_t *dma_addr, gfp_t flag,
struct dma_attrs *attrs);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eadcdd5bb946..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b70307..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@
/* Supported features which support lazy state saving */
#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
- XFEATURE_MASK_SSE | \
+ XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR | \
XFEATURE_MASK_YMM | \
- XFEATURE_MASK_OPMASK | \
+ XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
XFEATURE_MASK_Hi16_ZMM)
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
/* All currently supported features */
#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1e3408e88604..1815b736269d 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -130,6 +130,11 @@ struct irq_alloc_info {
char *uv_name;
};
#endif
+#if IS_ENABLED(CONFIG_VMD)
+ struct {
+ struct msi_desc *desc;
+ };
+#endif
};
};
diff --git a/arch/x86/include/asm/intel_punit_ipc.h b/arch/x86/include/asm/intel_punit_ipc.h
new file mode 100644
index 000000000000..201eb9dce595
--- /dev/null
+++ b/arch/x86/include/asm/intel_punit_ipc.h
@@ -0,0 +1,101 @@
+#ifndef _ASM_X86_INTEL_PUNIT_IPC_H_
+#define _ASM_X86_INTEL_PUNIT_IPC_H_
+
+/*
+ * Three types of 8bit P-Unit IPC commands are supported,
+ * bit[7:6]: [00]: BIOS; [01]: GTD; [10]: ISPD.
+ */
+typedef enum {
+ BIOS_IPC = 0,
+ GTDRIVER_IPC,
+ ISPDRIVER_IPC,
+ RESERVED_IPC,
+} IPC_TYPE;
+
+#define IPC_TYPE_OFFSET 6
+#define IPC_PUNIT_BIOS_CMD_BASE (BIOS_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_GTD_CMD_BASE (GTDDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_ISPD_CMD_BASE (ISPDRIVER_IPC << IPC_TYPE_OFFSET)
+#define IPC_PUNIT_CMD_TYPE_MASK (RESERVED_IPC << IPC_TYPE_OFFSET)
+
+/* BIOS => Pcode commands */
+#define IPC_PUNIT_BIOS_ZERO (IPC_PUNIT_BIOS_CMD_BASE | 0x00)
+#define IPC_PUNIT_BIOS_VR_INTERFACE (IPC_PUNIT_BIOS_CMD_BASE | 0x01)
+#define IPC_PUNIT_BIOS_READ_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x02)
+#define IPC_PUNIT_BIOS_WRITE_PCS (IPC_PUNIT_BIOS_CMD_BASE | 0x03)
+#define IPC_PUNIT_BIOS_READ_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x04)
+#define IPC_PUNIT_BIOS_WRITE_PCU_CONFIG (IPC_PUNIT_BIOS_CMD_BASE | 0x05)
+#define IPC_PUNIT_BIOS_READ_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x06)
+#define IPC_PUNIT_BIOS_WRITE_PL1_SETTING (IPC_PUNIT_BIOS_CMD_BASE | 0x07)
+#define IPC_PUNIT_BIOS_TRIGGER_VDD_RAM (IPC_PUNIT_BIOS_CMD_BASE | 0x08)
+#define IPC_PUNIT_BIOS_READ_TELE_INFO (IPC_PUNIT_BIOS_CMD_BASE | 0x09)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0a)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0b)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0c)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x0d)
+#define IPC_PUNIT_BIOS_READ_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0e)
+#define IPC_PUNIT_BIOS_WRITE_TELE_TRACE (IPC_PUNIT_BIOS_CMD_BASE | 0x0f)
+#define IPC_PUNIT_BIOS_READ_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x10)
+#define IPC_PUNIT_BIOS_WRITE_TELE_EVENT (IPC_PUNIT_BIOS_CMD_BASE | 0x11)
+#define IPC_PUNIT_BIOS_READ_MODULE_TEMP (IPC_PUNIT_BIOS_CMD_BASE | 0x12)
+#define IPC_PUNIT_BIOS_RESERVED (IPC_PUNIT_BIOS_CMD_BASE | 0x13)
+#define IPC_PUNIT_BIOS_READ_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x14)
+#define IPC_PUNIT_BIOS_WRITE_VOLTAGE_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x15)
+#define IPC_PUNIT_BIOS_READ_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x16)
+#define IPC_PUNIT_BIOS_WRITE_RATIO_OVER (IPC_PUNIT_BIOS_CMD_BASE | 0x17)
+#define IPC_PUNIT_BIOS_READ_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x18)
+#define IPC_PUNIT_BIOS_WRITE_VF_GL_CTRL (IPC_PUNIT_BIOS_CMD_BASE | 0x19)
+#define IPC_PUNIT_BIOS_READ_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1a)
+#define IPC_PUNIT_BIOS_WRITE_FM_SOC_TEMP_THRESH (IPC_PUNIT_BIOS_CMD_BASE | 0x1b)
+
+/* GT Driver => Pcode commands */
+#define IPC_PUNIT_GTD_ZERO (IPC_PUNIT_GTD_CMD_BASE | 0x00)
+#define IPC_PUNIT_GTD_CONFIG (IPC_PUNIT_GTD_CMD_BASE | 0x01)
+#define IPC_PUNIT_GTD_READ_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x02)
+#define IPC_PUNIT_GTD_WRITE_ICCP_LIC_CDYN_SCAL (IPC_PUNIT_GTD_CMD_BASE | 0x03)
+#define IPC_PUNIT_GTD_GET_WM_VAL (IPC_PUNIT_GTD_CMD_BASE | 0x06)
+#define IPC_PUNIT_GTD_WRITE_CONFIG_WISHREQ (IPC_PUNIT_GTD_CMD_BASE | 0x07)
+#define IPC_PUNIT_GTD_READ_REQ_DUTY_CYCLE (IPC_PUNIT_GTD_CMD_BASE | 0x16)
+#define IPC_PUNIT_GTD_DIS_VOL_FREQ_CHG_REQUEST (IPC_PUNIT_GTD_CMD_BASE | 0x17)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_CTRL (IPC_PUNIT_GTD_CMD_BASE | 0x1a)
+#define IPC_PUNIT_GTD_DYNA_DUTY_CYCLE_TUNING (IPC_PUNIT_GTD_CMD_BASE | 0x1c)
+
+/* ISP Driver => Pcode commands */
+#define IPC_PUNIT_ISPD_ZERO (IPC_PUNIT_ISPD_CMD_BASE | 0x00)
+#define IPC_PUNIT_ISPD_CONFIG (IPC_PUNIT_ISPD_CMD_BASE | 0x01)
+#define IPC_PUNIT_ISPD_GET_ISP_LTR_VAL (IPC_PUNIT_ISPD_CMD_BASE | 0x02)
+#define IPC_PUNIT_ISPD_ACCESS_IU_FREQ_BOUNDS (IPC_PUNIT_ISPD_CMD_BASE | 0x03)
+#define IPC_PUNIT_ISPD_READ_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x04)
+#define IPC_PUNIT_ISPD_WRITE_CDYN_LEVEL (IPC_PUNIT_ISPD_CMD_BASE | 0x05)
+
+/* Error codes */
+#define IPC_PUNIT_ERR_SUCCESS 0
+#define IPC_PUNIT_ERR_INVALID_CMD 1
+#define IPC_PUNIT_ERR_INVALID_PARAMETER 2
+#define IPC_PUNIT_ERR_CMD_TIMEOUT 3
+#define IPC_PUNIT_ERR_CMD_LOCKED 4
+#define IPC_PUNIT_ERR_INVALID_VR_ID 5
+#define IPC_PUNIT_ERR_VR_ERR 6
+
+#if IS_ENABLED(CONFIG_INTEL_PUNIT_IPC)
+
+int intel_punit_ipc_simple_command(int cmd, int para1, int para2);
+int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2, u32 *in, u32 *out);
+
+#else
+
+static inline int intel_punit_ipc_simple_command(int cmd,
+ int para1, int para2)
+{
+ return -ENODEV;
+}
+
+static inline int intel_punit_ipc_command(u32 cmd, u32 para1, u32 para2,
+ u32 *in, u32 *out)
+{
+ return -ENODEV;
+}
+
+#endif /* CONFIG_INTEL_PUNIT_IPC */
+
+#endif
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
new file mode 100644
index 000000000000..ed65fe701de5
--- /dev/null
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -0,0 +1,147 @@
+/*
+ * Intel SOC Telemetry Driver Header File
+ * Copyright (C) 2015, Intel Corporation.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#ifndef INTEL_TELEMETRY_H
+#define INTEL_TELEMETRY_H
+
+#define TELEM_MAX_EVENTS_SRAM 28
+#define TELEM_MAX_OS_ALLOCATED_EVENTS 20
+
+enum telemetry_unit {
+ TELEM_PSS = 0,
+ TELEM_IOSS,
+ TELEM_UNIT_NONE
+};
+
+struct telemetry_evtlog {
+ u32 telem_evtid;
+ u64 telem_evtlog;
+};
+
+struct telemetry_evtconfig {
+ /* Array of Event-IDs to Enable */
+ u32 *evtmap;
+
+ /* Number of Events (<29) in evtmap */
+ u8 num_evts;
+
+ /* Sampling period */
+ u8 period;
+};
+
+struct telemetry_evtmap {
+ const char *name;
+ u32 evt_id;
+};
+
+struct telemetry_unit_config {
+ struct telemetry_evtmap *telem_evts;
+ void __iomem *regmap;
+ u32 ssram_base_addr;
+ u8 ssram_evts_used;
+ u8 curr_period;
+ u8 max_period;
+ u8 min_period;
+ u32 ssram_size;
+
+};
+
+struct telemetry_plt_config {
+ struct telemetry_unit_config pss_config;
+ struct telemetry_unit_config ioss_config;
+ struct mutex telem_trace_lock;
+ struct mutex telem_lock;
+ bool telem_in_use;
+};
+
+struct telemetry_core_ops {
+ int (*get_sampling_period)(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+ int (*get_eventconfig)(struct telemetry_evtconfig *pss_evtconfig,
+ struct telemetry_evtconfig *ioss_evtconfig,
+ int pss_len, int ioss_len);
+
+ int (*update_events)(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+ int (*set_sampling_period)(u8 pss_period, u8 ioss_period);
+
+ int (*get_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+ int (*set_trace_verbosity)(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+ int (*raw_read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*read_eventlog)(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog,
+ int len, int log_all_evts);
+
+ int (*add_events)(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+ int (*reset_events)(void);
+};
+
+int telemetry_set_pltdata(struct telemetry_core_ops *ops,
+ struct telemetry_plt_config *pltconfig);
+
+int telemetry_clear_pltdata(void);
+
+int telemetry_pltconfig_valid(void);
+
+int telemetry_get_evtname(enum telemetry_unit telem_unit,
+ const char **name, int len);
+
+int telemetry_update_events(struct telemetry_evtconfig pss_evtconfig,
+ struct telemetry_evtconfig ioss_evtconfig);
+
+int telemetry_add_events(u8 num_pss_evts, u8 num_ioss_evts,
+ u32 *pss_evtmap, u32 *ioss_evtmap);
+
+int telemetry_reset_events(void);
+
+int telemetry_get_eventconfig(struct telemetry_evtconfig *pss_config,
+ struct telemetry_evtconfig *ioss_config,
+ int pss_len, int ioss_len);
+
+int telemetry_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_events(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_raw_read_eventlog(enum telemetry_unit telem_unit,
+ struct telemetry_evtlog *evtlog, int len);
+
+int telemetry_get_sampling_period(u8 *pss_min_period, u8 *pss_max_period,
+ u8 *ioss_min_period, u8 *ioss_max_period);
+
+int telemetry_set_sampling_period(u8 pss_period, u8 ioss_period);
+
+int telemetry_set_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 verbosity);
+
+int telemetry_get_trace_verbosity(enum telemetry_unit telem_unit,
+ u32 *verbosity);
+
+#endif /* INTEL_TELEMETRY_H */
diff --git a/arch/x86/include/asm/iosf_mbi.h b/arch/x86/include/asm/iosf_mbi.h
index b72ad0faa6c5..b41ee164930a 100644
--- a/arch/x86/include/asm/iosf_mbi.h
+++ b/arch/x86/include/asm/iosf_mbi.h
@@ -1,5 +1,5 @@
/*
- * iosf_mbi.h: Intel OnChip System Fabric MailBox access support
+ * Intel OnChip System Fabric MailBox access support
*/
#ifndef IOSF_MBI_SYMS_H
@@ -16,6 +16,18 @@
#define MBI_MASK_LO 0x000000FF
#define MBI_ENABLE 0xF0
+/* IOSF SB read/write opcodes */
+#define MBI_MMIO_READ 0x00
+#define MBI_MMIO_WRITE 0x01
+#define MBI_CFG_READ 0x04
+#define MBI_CFG_WRITE 0x05
+#define MBI_CR_READ 0x06
+#define MBI_CR_WRITE 0x07
+#define MBI_REG_READ 0x10
+#define MBI_REG_WRITE 0x11
+#define MBI_ESRAM_READ 0x12
+#define MBI_ESRAM_WRITE 0x13
+
/* Baytrail available units */
#define BT_MBI_UNIT_AUNIT 0x00
#define BT_MBI_UNIT_SMC 0x01
@@ -28,50 +40,13 @@
#define BT_MBI_UNIT_SATA 0xA3
#define BT_MBI_UNIT_PCIE 0xA6
-/* Baytrail read/write opcodes */
-#define BT_MBI_AUNIT_READ 0x10
-#define BT_MBI_AUNIT_WRITE 0x11
-#define BT_MBI_SMC_READ 0x10
-#define BT_MBI_SMC_WRITE 0x11
-#define BT_MBI_CPU_READ 0x10
-#define BT_MBI_CPU_WRITE 0x11
-#define BT_MBI_BUNIT_READ 0x10
-#define BT_MBI_BUNIT_WRITE 0x11
-#define BT_MBI_PMC_READ 0x06
-#define BT_MBI_PMC_WRITE 0x07
-#define BT_MBI_GFX_READ 0x00
-#define BT_MBI_GFX_WRITE 0x01
-#define BT_MBI_SMIO_READ 0x06
-#define BT_MBI_SMIO_WRITE 0x07
-#define BT_MBI_USB_READ 0x06
-#define BT_MBI_USB_WRITE 0x07
-#define BT_MBI_SATA_READ 0x00
-#define BT_MBI_SATA_WRITE 0x01
-#define BT_MBI_PCIE_READ 0x00
-#define BT_MBI_PCIE_WRITE 0x01
-
/* Quark available units */
#define QRK_MBI_UNIT_HBA 0x00
#define QRK_MBI_UNIT_HB 0x03
#define QRK_MBI_UNIT_RMU 0x04
#define QRK_MBI_UNIT_MM 0x05
-#define QRK_MBI_UNIT_MMESRAM 0x05
#define QRK_MBI_UNIT_SOC 0x31
-/* Quark read/write opcodes */
-#define QRK_MBI_HBA_READ 0x10
-#define QRK_MBI_HBA_WRITE 0x11
-#define QRK_MBI_HB_READ 0x10
-#define QRK_MBI_HB_WRITE 0x11
-#define QRK_MBI_RMU_READ 0x10
-#define QRK_MBI_RMU_WRITE 0x11
-#define QRK_MBI_MM_READ 0x10
-#define QRK_MBI_MM_WRITE 0x11
-#define QRK_MBI_MMESRAM_READ 0x12
-#define QRK_MBI_MMESRAM_WRITE 0x13
-#define QRK_MBI_SOC_READ 0x06
-#define QRK_MBI_SOC_WRITE 0x07
-
#if IS_ENABLED(CONFIG_IOSF_MBI)
bool iosf_mbi_available(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 30cfd64295a0..44adbb819041 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
+#include <linux/hyperv.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -45,6 +46,31 @@
#define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS
+/* x86-specific vcpu->requests bit members */
+#define KVM_REQ_MIGRATE_TIMER 8
+#define KVM_REQ_REPORT_TPR_ACCESS 9
+#define KVM_REQ_TRIPLE_FAULT 10
+#define KVM_REQ_MMU_SYNC 11
+#define KVM_REQ_CLOCK_UPDATE 12
+#define KVM_REQ_DEACTIVATE_FPU 13
+#define KVM_REQ_EVENT 14
+#define KVM_REQ_APF_HALT 15
+#define KVM_REQ_STEAL_UPDATE 16
+#define KVM_REQ_NMI 17
+#define KVM_REQ_PMU 18
+#define KVM_REQ_PMI 19
+#define KVM_REQ_SMI 20
+#define KVM_REQ_MASTERCLOCK_UPDATE 21
+#define KVM_REQ_MCLOCK_INPROGRESS 22
+#define KVM_REQ_SCAN_IOAPIC 23
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE 24
+#define KVM_REQ_APIC_PAGE_RELOAD 25
+#define KVM_REQ_HV_CRASH 26
+#define KVM_REQ_IOAPIC_EOI_EXIT 27
+#define KVM_REQ_HV_RESET 28
+#define KVM_REQ_HV_EXIT 29
+#define KVM_REQ_HV_STIMER 30
+
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -213,6 +239,10 @@ union kvm_mmu_page_role {
};
};
+struct kvm_rmap_head {
+ unsigned long val;
+};
+
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
@@ -230,7 +260,7 @@ struct kvm_mmu_page {
bool unsync;
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
- unsigned long parent_ptes; /* Reverse mapping for parent_pte */
+ struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
/* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
unsigned long mmu_valid_gen;
@@ -374,10 +404,38 @@ struct kvm_mtrr {
struct list_head head;
};
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+ struct hrtimer timer;
+ int index;
+ u64 config;
+ u64 count;
+ u64 exp_time;
+ struct hv_message msg;
+ bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC)*/
+struct kvm_vcpu_hv_synic {
+ u64 version;
+ u64 control;
+ u64 msg_page;
+ u64 evt_page;
+ atomic64_t sint[HV_SYNIC_SINT_COUNT];
+ atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+ DECLARE_BITMAP(auto_eoi_bitmap, 256);
+ DECLARE_BITMAP(vec_bitmap, 256);
+ bool active;
+};
+
/* Hyper-V per vcpu emulation context */
struct kvm_vcpu_hv {
u64 hv_vapic;
s64 runtime_offset;
+ struct kvm_vcpu_hv_synic synic;
+ struct kvm_hyperv_exit exit;
+ struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+ DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
};
struct kvm_vcpu_arch {
@@ -400,7 +458,8 @@ struct kvm_vcpu_arch {
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
- u64 eoi_exit_bitmap[4];
+ bool apicv_active;
+ DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
@@ -589,7 +648,7 @@ struct kvm_lpage_info {
};
struct kvm_arch_memory_slot {
- unsigned long *rmap[KVM_NR_PAGE_SIZES];
+ struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
@@ -831,10 +890,11 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
+ bool (*get_enable_apicv)(void);
+ void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm *kvm, int isr);
- void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
+ void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -1086,6 +1146,8 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
@@ -1231,6 +1293,9 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_make_mclock_inprogress_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
#define GUEST_PL 1
/* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
/* Where we map the Switcher, in both Host and Guest. */
extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
- /* Re-load page tables */
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
load_cr3(next->pgd);
+
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* schedule, protecting us from simultaneous changes.
*/
cpumask_set_cpu(cpu, mm_cpumask(next));
+
/*
* We were in lazy tlb mode and leave_mm disabled
* tlb flush IPI delivery. We must reload CR3
* to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
*/
load_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index fa1195dae425..46873fbd44e1 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -151,11 +151,11 @@ extern struct list_head pci_mmcfg_list;
#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
/*
- * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
- * on their northbrige except through the * %eax register. As such, you MUST
- * NOT use normal IOMEM accesses, you need to only use the magic mmio-config
- * accessor functions.
- * In fact just use pci_config_*, nothing else please.
+ * On AMD Fam10h CPUs, all PCI MMIO configuration space accesses must use
+ * %eax. No other source or target registers may be used. The following
+ * mmio_config_* accessors enforce this. See "BIOS and Kernel Developer's
+ * Guide (BKDG) For AMD Family 10h Processors", rev. 3.48, sec 2.11.1,
+ * "MMIO Configuration Coding Requirements".
*/
static inline unsigned char mmio_config_readb(void __iomem *pos)
{
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d3eee663c41f..0687c4748b8f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
- return pmd_val(pmd) & _PAGE_SPLITTING;
-}
-
static inline int pmd_trans_huge(pmd_t pmd)
{
- return pmd_val(pmd) & _PAGE_PSE;
+ return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
}
static inline int has_transparent_hugepage(void)
{
return cpu_has_pse;
}
+
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pmd_devmap(pmd_t pmd)
+{
+ return !!(pmd_val(pmd) & _PAGE_DEVMAP);
+}
+#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
@@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pte_t pte_mkdevmap(pte_t pte)
+{
+ return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
+}
+
static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
{
pmdval_t v = native_pmd_val(pmd);
@@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_ACCESSED);
}
+static inline pmd_t pmd_mkclean(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_DIRTY);
+}
+
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
return pmd_clear_flags(pmd, _PAGE_RW);
@@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY);
}
+static inline pmd_t pmd_mkdevmap(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DEVMAP);
+}
+
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
return pmd_set_flags(pmd, _PAGE_PSE);
@@ -462,6 +479,13 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static inline int pte_devmap(pte_t a)
+{
+ return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
+}
+#endif
+
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
@@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long addr, pmd_t *pmdp);
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index a471cadb9630..04c27a013165 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,10 +22,11 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -46,7 +47,6 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
-#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
@@ -85,8 +85,11 @@
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
+#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
+#define __HAVE_ARCH_PTE_DEVMAP
#else
#define _PAGE_NX (_AT(pteval_t, 0))
+#define _PAGE_DEVMAP (_AT(pteval_t, 0))
#endif
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
diff --git a/arch/x86/include/asm/platform_sst_audio.h b/arch/x86/include/asm/platform_sst_audio.h
index 7249e6d0902d..5973a2f3db3d 100644
--- a/arch/x86/include/asm/platform_sst_audio.h
+++ b/arch/x86/include/asm/platform_sst_audio.h
@@ -55,6 +55,7 @@ enum sst_audio_device_id_mrfld {
PIPE_MEDIA0_IN = 0x8F,
PIPE_MEDIA1_IN = 0x90,
PIPE_MEDIA2_IN = 0x91,
+ PIPE_MEDIA3_IN = 0x9C,
PIPE_RSVD = 0xFF,
};
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index d8ce3ec816ab..c57fd1ea9689 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
}
/**
- * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * arch_wb_cache_pmem - write back a cache range with CLWB
* @vaddr: virtual start address
* @size: number of bytes to write back
*
* Write back a cache range using the CLWB (cache line write back)
* instruction. This function requires explicit ordering with an
- * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
+ * arch_wmb_pmem() call.
*/
-static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
{
u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
unsigned long clflush_mask = x86_clflush_size - 1;
+ void *vaddr = (void __force *)addr;
void *vend = vaddr + size;
void *p;
@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
len = copy_from_iter_nocache(vaddr, bytes, i);
if (__iter_needs_pmem_wb(i))
- __arch_wb_cache_pmem(vaddr, bytes);
+ arch_wb_cache_pmem(addr, bytes);
return len;
}
@@ -132,13 +133,8 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
{
void *vaddr = (void __force *)addr;
- /* TODO: implement the zeroing via non-temporal writes */
- if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0)
- clear_page(vaddr);
- else
- memset(vaddr, 0, size);
-
- __arch_wb_cache_pmem(vaddr, size);
+ memset(vaddr, 0, size);
+ arch_wb_cache_pmem(addr, size);
}
static inline bool __arch_has_wmb_pmem(void)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 660458af425d..a4a30e4b2d34 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -134,6 +134,9 @@ extern int __get_user_4(void);
extern int __get_user_8(void);
extern int __get_user_bad(void);
+#define __uaccess_begin() stac()
+#define __uaccess_end() clac()
+
/*
* This is a type: either unsigned long, if the argument fits into
* that type, or otherwise unsigned long long.
@@ -193,10 +196,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
#ifdef CONFIG_X86_32
#define __put_user_asm_u64(x, addr, err, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%2)\n" \
"2: movl %%edx,4(%2)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
".section .fixup,\"ax\"\n" \
"4: movl %3,%0\n" \
" jmp 3b\n" \
@@ -207,10 +210,10 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
: "A" (x), "r" (addr), "i" (errret), "0" (err))
#define __put_user_asm_ex_u64(x, addr) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: movl %%eax,0(%1)\n" \
"2: movl %%edx,4(%1)\n" \
- "3: " ASM_CLAC "\n" \
+ "3:" \
_ASM_EXTABLE_EX(1b, 2b) \
_ASM_EXTABLE_EX(2b, 3b) \
: : "A" (x), "r" (addr))
@@ -304,6 +307,10 @@ do { \
} \
} while (0)
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __put_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -358,9 +365,9 @@ do { \
} while (0)
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %2,%"rtype"1\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
@@ -370,6 +377,10 @@ do { \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
+/*
+ * This doesn't do __uaccess_begin/end - the exception handling
+ * around it must do that.
+ */
#define __get_user_size_ex(x, ptr, size) \
do { \
__chk_user_ptr(ptr); \
@@ -400,7 +411,9 @@ do { \
#define __put_user_nocheck(x, ptr, size) \
({ \
int __pu_err; \
+ __uaccess_begin(); \
__put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
+ __uaccess_end(); \
__builtin_expect(__pu_err, 0); \
})
@@ -408,7 +421,9 @@ do { \
({ \
int __gu_err; \
unsigned long __gu_val; \
+ __uaccess_begin(); \
__get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
+ __uaccess_end(); \
(x) = (__force __typeof__(*(ptr)))__gu_val; \
__builtin_expect(__gu_err, 0); \
})
@@ -423,9 +438,9 @@ struct __large_struct { unsigned long buf[100]; };
* aliasing issues.
*/
#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
- asm volatile(ASM_STAC "\n" \
+ asm volatile("\n" \
"1: mov"itype" %"rtype"1,%2\n" \
- "2: " ASM_CLAC "\n" \
+ "2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" jmp 2b\n" \
@@ -445,11 +460,11 @@ struct __large_struct { unsigned long buf[100]; };
*/
#define uaccess_try do { \
current_thread_info()->uaccess_err = 0; \
- stac(); \
+ __uaccess_begin(); \
barrier();
#define uaccess_catch(err) \
- clac(); \
+ __uaccess_end(); \
(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \
} while (0)
@@ -547,12 +562,13 @@ extern void __cmpxchg_wrong_size(void)
__typeof__(ptr) __uval = (uval); \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
+ __uaccess_begin(); \
switch (size) { \
case 1: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgb %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -566,9 +582,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 2: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgw %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -582,9 +598,9 @@ extern void __cmpxchg_wrong_size(void)
} \
case 4: \
{ \
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -601,9 +617,9 @@ extern void __cmpxchg_wrong_size(void)
if (!IS_ENABLED(CONFIG_X86_64)) \
__cmpxchg_wrong_size(); \
\
- asm volatile("\t" ASM_STAC "\n" \
+ asm volatile("\n" \
"1:\t" LOCK_PREFIX "cmpxchgq %4, %2\n" \
- "2:\t" ASM_CLAC "\n" \
+ "2:\n" \
"\t.section .fixup, \"ax\"\n" \
"3:\tmov %3, %0\n" \
"\tjmp 2b\n" \
@@ -618,6 +634,7 @@ extern void __cmpxchg_wrong_size(void)
default: \
__cmpxchg_wrong_size(); \
} \
+ __uaccess_end(); \
*__uval = __old; \
__ret; \
})
@@ -754,5 +771,30 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
*/
#define __copy_from_user_nmi __copy_from_user_inatomic
+/*
+ * The "unsafe" user accesses aren't really "unsafe", but the naming
+ * is a big fat warning: you have to not only do the access_ok()
+ * checking before using them, but you have to surround them with the
+ * user_access_begin/end() pair.
+ */
+#define user_access_begin() __uaccess_begin()
+#define user_access_end() __uaccess_end()
+
+#define unsafe_put_user(x, ptr) \
+({ \
+ int __pu_err; \
+ __put_user_size((x), (ptr), sizeof(*(ptr)), __pu_err, -EFAULT); \
+ __builtin_expect(__pu_err, 0); \
+})
+
+#define unsafe_get_user(x, ptr) \
+({ \
+ int __gu_err; \
+ unsigned long __gu_val; \
+ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err, -EFAULT); \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
+ __builtin_expect(__gu_err, 0); \
+})
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f2f9b39b274a..b89c34c4019b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -56,35 +56,49 @@ int __copy_from_user_nocheck(void *dst, const void __user *src, unsigned size)
if (!__builtin_constant_p(size))
return copy_user_generic(dst, (__force void *)src, size);
switch (size) {
- case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
+ case 1:
+ __uaccess_begin();
+ __get_user_asm(*(u8 *)dst, (u8 __user *)src,
ret, "b", "b", "=q", 1);
+ __uaccess_end();
return ret;
- case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
+ case 2:
+ __uaccess_begin();
+ __get_user_asm(*(u16 *)dst, (u16 __user *)src,
ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
- case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
+ case 4:
+ __uaccess_begin();
+ __get_user_asm(*(u32 *)dst, (u32 __user *)src,
ret, "l", "k", "=r", 4);
+ __uaccess_end();
return ret;
- case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
+ case 8:
+ __uaccess_begin();
+ __get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 10);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u16 *)(8 + (char *)dst),
- (u16 __user *)(8 + (char __user *)src),
- ret, "w", "w", "=r", 2);
+ if (likely(!ret))
+ __get_user_asm(*(u16 *)(8 + (char *)dst),
+ (u16 __user *)(8 + (char __user *)src),
+ ret, "w", "w", "=r", 2);
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__get_user_asm(*(u64 *)dst, (u64 __user *)src,
ret, "q", "", "=r", 16);
- if (unlikely(ret))
- return ret;
- __get_user_asm(*(u64 *)(8 + (char *)dst),
- (u64 __user *)(8 + (char __user *)src),
- ret, "q", "", "=r", 8);
+ if (likely(!ret))
+ __get_user_asm(*(u64 *)(8 + (char *)dst),
+ (u64 __user *)(8 + (char __user *)src),
+ ret, "q", "", "=r", 8);
+ __uaccess_end();
return ret;
default:
return copy_user_generic(dst, (__force void *)src, size);
@@ -106,35 +120,51 @@ int __copy_to_user_nocheck(void __user *dst, const void *src, unsigned size)
if (!__builtin_constant_p(size))
return copy_user_generic((__force void *)dst, src, size);
switch (size) {
- case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
+ case 1:
+ __uaccess_begin();
+ __put_user_asm(*(u8 *)src, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
- case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
+ case 2:
+ __uaccess_begin();
+ __put_user_asm(*(u16 *)src, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
- case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
+ case 4:
+ __uaccess_begin();
+ __put_user_asm(*(u32 *)src, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
- case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
+ case 8:
+ __uaccess_begin();
+ __put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
case 10:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 10);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
- ret, "w", "w", "ir", 2);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
+ ret, "w", "w", "ir", 2);
+ }
+ __uaccess_end();
return ret;
case 16:
+ __uaccess_begin();
__put_user_asm(*(u64 *)src, (u64 __user *)dst,
ret, "q", "", "er", 16);
- if (unlikely(ret))
- return ret;
- asm("":::"memory");
- __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
- ret, "q", "", "er", 8);
+ if (likely(!ret)) {
+ asm("":::"memory");
+ __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
+ ret, "q", "", "er", 8);
+ }
+ __uaccess_end();
return ret;
default:
return copy_user_generic((__force void *)dst, src, size);
@@ -160,39 +190,47 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
switch (size) {
case 1: {
u8 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u8 __user *)src,
ret, "b", "b", "=q", 1);
if (likely(!ret))
__put_user_asm(tmp, (u8 __user *)dst,
ret, "b", "b", "iq", 1);
+ __uaccess_end();
return ret;
}
case 2: {
u16 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u16 __user *)src,
ret, "w", "w", "=r", 2);
if (likely(!ret))
__put_user_asm(tmp, (u16 __user *)dst,
ret, "w", "w", "ir", 2);
+ __uaccess_end();
return ret;
}
case 4: {
u32 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u32 __user *)src,
ret, "l", "k", "=r", 4);
if (likely(!ret))
__put_user_asm(tmp, (u32 __user *)dst,
ret, "l", "k", "ir", 4);
+ __uaccess_end();
return ret;
}
case 8: {
u64 tmp;
+ __uaccess_begin();
__get_user_asm(tmp, (u64 __user *)src,
ret, "q", "", "=r", 8);
if (likely(!ret))
__put_user_asm(tmp, (u64 __user *)dst,
ret, "q", "", "er", 8);
+ __uaccess_end();
return ret;
}
default:
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 4c20dd333412..3bcdcc84259d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -310,10 +310,10 @@ HYPERVISOR_mca(struct xen_mc *mc_op)
}
static inline int
-HYPERVISOR_dom0_op(struct xen_platform_op *platform_op)
+HYPERVISOR_platform_op(struct xen_platform_op *op)
{
- platform_op->interface_version = XENPF_INTERFACE_VERSION;
- return _hypercall1(int, dom0_op, platform_op);
+ op->interface_version = XENPF_INTERFACE_VERSION;
+ return _hypercall1(int, platform_op, op);
}
static inline int
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 040d4083c24f..7956412d09bd 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -269,4 +269,96 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17)
#define HV_SYNIC_SINT_VECTOR_MASK (0xFF)
+#define HV_SYNIC_STIMER_COUNT (4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE (256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+ HVMSG_NONE = 0x00000000,
+
+ /* Memory access messages. */
+ HVMSG_UNMAPPED_GPA = 0x80000000,
+ HVMSG_GPA_INTERCEPT = 0x80000001,
+
+ /* Timer notification messages. */
+ HVMSG_TIMER_EXPIRED = 0x80000010,
+
+ /* Error messages. */
+ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
+ HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
+ HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
+
+ /* Trace buffer complete messages. */
+ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
+
+ /* Platform-specific processor intercept messages. */
+ HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
+ HVMSG_X64_MSR_INTERCEPT = 0x80010001,
+ HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
+ HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
+ HVMSG_X64_APIC_EOI = 0x80010004,
+ HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+ __u8 asu8;
+ struct {
+ __u8 msg_pending:1;
+ __u8 reserved:7;
+ };
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+ __u32 asu32;
+ struct {
+ __u32 id:24;
+ __u32 reserved:8;
+ } u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+ __u32 message_type;
+ __u8 payload_size;
+ union hv_message_flags message_flags;
+ __u8 reserved[2];
+ union {
+ __u64 sender;
+ union hv_port_id port;
+ };
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+ struct hv_message_header header;
+ union {
+ __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+ } u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+ struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+ __u32 timer_index;
+ __u32 reserved;
+ __u64 expiration_time; /* When the timer expired */
+ __u64 delivery_time; /* When the message was delivered */
+};
+
+#define HV_STIMER_ENABLE (1ULL << 0)
+#define HV_STIMER_PERIODIC (1ULL << 1)
+#define HV_STIMER_LAZY (1ULL << 2)
+#define HV_STIMER_AUTOENABLE (1ULL << 3)
+#define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F)
+
#endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e678ddeed030..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
*/
int ht_nodeid = c->initial_apicid;
- if (ht_nodeid >= 0 &&
- __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+ if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
node = __apicid_to_node[ht_nodeid];
/* Pick a nearby node */
if (!node_online(node))
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index bd3507da39f0..2836de390f95 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -58,28 +58,6 @@ static void cpuid_smp_cpuid(void *cmd_block)
&cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx);
}
-static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file->f_mapping->host;
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case 0:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case 1:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t cpuid_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -132,7 +110,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
*/
static const struct file_operations cpuid_fops = {
.owner = THIS_MODULE,
- .llseek = cpuid_seek,
+ .llseek = no_seek_end_llseek,
.read = cpuid_read,
.open = cpuid_open,
};
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index db9a675e751b..bca14c899137 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -547,6 +547,7 @@ static const struct pci_device_id intel_stolen_ids[] __initconst = {
INTEL_CHV_IDS(&chv_stolen_funcs),
INTEL_SKL_IDS(&gen9_stolen_funcs),
INTEL_BXT_IDS(&gen9_stolen_funcs),
+ INTEL_KBL_IDS(&gen9_stolen_funcs),
};
static void __init intel_graphics_stolen(int num, int slot, int func)
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7b2978ab30df..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
*/
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
#include <linux/sched.h>
+#include <linux/init.h>
/*
* Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
{
- if (!strcmp(s, "on"))
- eagerfpu = ENABLE;
- else if (!strcmp(s, "off"))
- eagerfpu = DISABLE;
- else if (!strcmp(s, "auto"))
- eagerfpu = AUTO;
- return 1;
+ /* Support all xfeatures known to us */
+ if (eagerfpu != DISABLE)
+ return XCNTXT_MASK;
+
+ /* Warning of xfeatures being disabled for no eagerfpu mode */
+ if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+ pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+ xfeatures_mask & XFEATURE_MASK_EAGER);
+ }
+
+ /* Return a mask that masks out all features requiring eagerfpu mode */
+ return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX);
+ setup_clear_cpu_cap(X86_FEATURE_AVX2);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
}
-__setup("eagerfpu=", eager_fpu_setup);
/*
* Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ * FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ * switching. Should the kernel boot with noxsaveopt, we support MPX
+ * with eager FPU switching at a higher cost.
*/
static void __init fpu__init_system_ctx_switch(void)
{
@@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(current->thread.fpu.fpstate_active);
current_thread_info()->status = 0;
- /* Auto enable eagerfpu for xsaveopt */
if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
eagerfpu = ENABLE;
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- if (eagerfpu == DISABLE) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- xfeatures_mask &= ~XFEATURE_MASK_EAGER;
- } else {
- eagerfpu = ENABLE;
- }
- }
+ if (xfeatures_mask & XFEATURE_MASK_EAGER)
+ eagerfpu = ENABLE;
if (eagerfpu == ENABLE)
setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -316,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
}
/*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+ /*
+ * No need to check "eagerfpu=auto" again, since it is the
+ * initial default.
+ */
+ if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+ eagerfpu = DISABLE;
+ fpu__clear_eager_fpu_features();
+ } else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+ eagerfpu = ENABLE;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "no387"))
+ setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+ if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+ setup_clear_cpu_cap(X86_FEATURE_FXSR);
+ setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+ setup_clear_cpu_cap(X86_FEATURE_XMM);
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+ fpu__xstate_clear_all_cpu_caps();
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+ if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
* Called on the boot CPU once per system bootup, to set up the initial
* FPU state that is later cloned into all processes:
*/
void __init fpu__init_system(struct cpuinfo_x86 *c)
{
+ fpu__init_parse_early_param();
fpu__init_system_early_generic(c);
/*
@@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_ctx_switch();
}
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FPU);
- return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
- if (strlen(s))
- return 0;
-
- fpu__xstate_clear_all_cpu_caps();
-
- return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
- return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
- return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
- setup_clear_cpu_cap(X86_FEATURE_FXSR);
- setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
- setup_clear_cpu_cap(X86_FEATURE_XMM);
-
- return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 40f100285984..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_MPX);
+ setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
}
/*
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
BUG();
}
- /* Support only the state known to the OS: */
- xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+ xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 311bcf338f07..29408d6d6626 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -105,14 +105,14 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
{
unsigned char replaced[MCOUNT_INSN_SIZE];
+ ftrace_expected = old_code;
+
/*
- * Note: Due to modules and __init, code can
- * disappear and change, we need to protect against faulting
- * as well as code changing. We do this by using the
- * probe_kernel_* functions.
- *
- * No real locking needed, this code is run through
- * kstop_machine, or before SMP starts.
+ * Note:
+ * We are paranoid about modifying text, as if a bug was to happen, it
+ * could cause us to read or write to someplace that could cause harm.
+ * Carefully read and modify the code with probe_kernel_*(), and make
+ * sure what we read is what we expected it to be before modifying it.
*/
/* read the text we want to modify */
@@ -154,6 +154,8 @@ int ftrace_make_nop(struct module *mod,
if (addr == MCOUNT_ADDR)
return ftrace_modify_code_direct(rec->ip, old, new);
+ ftrace_expected = NULL;
+
/* Normal cases use add_brk_on_nop */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
@@ -220,6 +222,7 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
+ ftrace_expected = NULL;
return -EINVAL;
}
@@ -314,6 +317,8 @@ static int add_break(unsigned long ip, const char *old)
if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
return -EFAULT;
+ ftrace_expected = old;
+
/* Make sure it is what we expect it to be */
if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
return -EINVAL;
@@ -413,6 +418,8 @@ static int remove_breakpoint(struct dyn_ftrace *rec)
ftrace_addr = ftrace_get_addr_curr(rec);
nop = ftrace_call_replace(ip, ftrace_addr);
+ ftrace_expected = nop;
+
if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
return -EINVAL;
}
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
index d1d35ccffed3..92fc1a51f994 100644
--- a/arch/x86/kernel/livepatch.c
+++ b/arch/x86/kernel/livepatch.c
@@ -20,8 +20,6 @@
#include <linux/module.h>
#include <linux/uaccess.h>
-#include <asm/cacheflush.h>
-#include <asm/page_types.h>
#include <asm/elf.h>
#include <asm/livepatch.h>
@@ -38,11 +36,10 @@
int klp_write_module_reloc(struct module *mod, unsigned long type,
unsigned long loc, unsigned long value)
{
- int ret, numpages, size = 4;
- bool readonly;
+ size_t size = 4;
unsigned long val;
- unsigned long core = (unsigned long)mod->module_core;
- unsigned long core_size = mod->core_size;
+ unsigned long core = (unsigned long)mod->core_layout.base;
+ unsigned long core_size = mod->core_layout.size;
switch (type) {
case R_X86_64_NONE:
@@ -69,23 +66,5 @@ int klp_write_module_reloc(struct module *mod, unsigned long type,
/* loc does not point to any symbol inside the module */
return -EINVAL;
- readonly = false;
-
-#ifdef CONFIG_DEBUG_SET_MODULE_RONX
- if (loc < core + mod->core_ro_size)
- readonly = true;
-#endif
-
- /* determine if the relocation spans a page boundary */
- numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 1 : 2;
-
- if (readonly)
- set_memory_rw(loc & PAGE_MASK, numpages);
-
- ret = probe_kernel_write((void *)loc, &val, size);
-
- if (readonly)
- set_memory_ro(loc & PAGE_MASK, numpages);
-
- return ret;
+ return probe_kernel_write((void *)loc, &val, size);
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 819ab3f9c9c7..ba7fbba9831b 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -385,6 +385,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
return image->fops->cleanup(image->image_loader_data);
}
+#ifdef CONFIG_KEXEC_VERIFY_SIG
int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
unsigned long kernel_len)
{
@@ -395,6 +396,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void *kernel,
return image->fops->verify_sig(kernel, kernel_len);
}
+#endif
/*
* Apply purgatory relocations.
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 113e70784854..64f9616f93f1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -45,28 +45,6 @@
static struct class *msr_class;
-static loff_t msr_seek(struct file *file, loff_t offset, int orig)
-{
- loff_t ret;
- struct inode *inode = file_inode(file);
-
- mutex_lock(&inode->i_mutex);
- switch (orig) {
- case SEEK_SET:
- file->f_pos = offset;
- ret = file->f_pos;
- break;
- case SEEK_CUR:
- file->f_pos += offset;
- ret = file->f_pos;
- break;
- default:
- ret = -EINVAL;
- }
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -194,7 +172,7 @@ static int msr_open(struct inode *inode, struct file *file)
*/
static const struct file_operations msr_fops = {
.owner = THIS_MODULE,
- .llseek = msr_seek,
+ .llseek = no_seek_end_llseek,
.read = msr_read,
.write = msr_write,
.open = msr_open,
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d64889aa2d46..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
},
},
+ { /* Handle problems with rebooting on the iMac10,1. */
+ .callback = set_pci_reboot,
+ .ident = "Apple iMac10,1",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+ },
+ },
/* ASRock */
{ /* Handle problems with rebooting on ASRock Q1900DC-ITX */
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f581e7..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@ verify_cpu:
pushfl
popl %eax
cmpl %eax,%ebx
- jz verify_cpu_no_longmode # cpu has no cpuid
+ jz .Lverify_cpu_no_longmode # cpu has no cpuid
#endif
movl $0x0,%eax # See if cpuid 1 is implemented
cpuid
cmpl $0x1,%eax
- jb verify_cpu_no_longmode # no cpuid 1
+ jb .Lverify_cpu_no_longmode # no cpuid 1
xor %di,%di
cmpl $0x68747541,%ebx # AuthenticAMD
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x69746e65,%edx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
cmpl $0x444d4163,%ecx
- jnz verify_cpu_noamd
+ jnz .Lverify_cpu_noamd
mov $1,%di # cpu is from AMD
- jmp verify_cpu_check
+ jmp .Lverify_cpu_check
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
cmpl $0x756e6547,%ebx # GenuineIntel?
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x49656e69,%edx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
cmpl $0x6c65746e,%ecx
- jnz verify_cpu_check
+ jnz .Lverify_cpu_check
# only call IA32_MISC_ENABLE when:
# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@ verify_cpu_noamd:
andl $0x0ff00f00, %eax # mask family and extended family
shrl $8, %eax
cmpl $6, %eax
- ja verify_cpu_clear_xd # family > 6, ok
- jb verify_cpu_check # family < 6, skip
+ ja .Lverify_cpu_clear_xd # family > 6, ok
+ jb .Lverify_cpu_check # family < 6, skip
andl $0x000f00f0, %ecx # mask model and extended model
shrl $4, %ecx
cmpl $0xd, %ecx
- jb verify_cpu_check # family == 6, model < 0xd, skip
+ jb .Lverify_cpu_check # family == 6, model < 0xd, skip
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
movl $MSR_IA32_MISC_ENABLE, %ecx
rdmsr
btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
- jnc verify_cpu_check # only write MSR if bit was changed
+ jnc .Lverify_cpu_check # only write MSR if bit was changed
wrmsr
-verify_cpu_check:
+.Lverify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
xorl $REQUIRED_MASK0,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
movl $0x80000000,%eax # See if extended cpuid is implemented
cpuid
cmpl $0x80000001,%eax
- jb verify_cpu_no_longmode # no extended cpuid
+ jb .Lverify_cpu_no_longmode # no extended cpuid
movl $0x80000001,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK1,%edx
xorl $REQUIRED_MASK1,%edx
- jnz verify_cpu_no_longmode
+ jnz .Lverify_cpu_no_longmode
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
movl $1,%eax
cpuid
andl $SSE_MASK,%edx
cmpl $SSE_MASK,%edx
- je verify_cpu_sse_ok
+ je .Lverify_cpu_sse_ok
test %di,%di
- jz verify_cpu_no_longmode # only try to force SSE on AMD
+ jz .Lverify_cpu_no_longmode # only try to force SSE on AMD
movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
xor %di,%di # don't loop
- jmp verify_cpu_sse_test # try again
+ jmp .Lverify_cpu_sse_test # try again
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
popf # Restore caller passed flags
movl $1,%eax
ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
popf # Restore caller passed flags
xorl %eax, %eax
ret
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 483231ebbb0b..e574b8546518 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
- split_huge_page_pmd_mm(mm, 0xA0000, pmd);
+
+ if (pmd_trans_huge(*pmd)) {
+ struct vm_area_struct *vma = find_vma(mm, 0xA0000);
+ split_huge_pmd(vma, pmd, 0xA0000);
+ }
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 62cf8c915e95..c58ba67175ac 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,13 +23,665 @@
#include "x86.h"
#include "lapic.h"
+#include "ioapic.h"
#include "hyperv.h"
#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/apicdef.h>
#include <trace/events/kvm.h>
#include "trace.h"
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+ return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+ if (sint_value & HV_SYNIC_SINT_MASKED)
+ return -1;
+ return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ return true;
+ }
+ return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+ int vector)
+{
+ int i;
+ u64 sint_value;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ sint_value = synic_read_sint(synic, i);
+ if (synic_get_sint_vector(sint_value) == vector &&
+ sint_value & HV_SYNIC_SINT_AUTO_EOI)
+ return true;
+ }
+ return false;
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+ u64 data, bool host)
+{
+ int vector;
+
+ vector = data & HV_SYNIC_SINT_VECTOR_MASK;
+ if (vector < 16 && !host)
+ return 1;
+ /*
+ * Guest may configure multiple SINTs to use the same vector, so
+ * we maintain a bitmap of vectors handled by synic, and a
+ * bitmap of vectors with auto-eoi behavior. The bitmaps are
+ * updated here, and atomically queried on fast paths.
+ */
+
+ atomic64_set(&synic->sint[sint], data);
+
+ if (synic_has_vector_connected(synic, vector))
+ __set_bit(vector, synic->vec_bitmap);
+ else
+ __clear_bit(vector, synic->vec_bitmap);
+
+ if (synic_has_vector_auto_eoi(synic, vector))
+ __set_bit(vector, synic->auto_eoi_bitmap);
+ else
+ __clear_bit(vector, synic->auto_eoi_bitmap);
+
+ /* Load SynIC vectors into EOI exit bitmap */
+ kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+ return 0;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vcpu_hv_synic *synic;
+
+ if (vcpu_id >= atomic_read(&kvm->online_vcpus))
+ return NULL;
+ vcpu = kvm_get_vcpu(kvm, vcpu_id);
+ if (!vcpu)
+ return NULL;
+ synic = vcpu_to_synic(vcpu);
+ return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+ u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *msg;
+ struct hv_message_page *msg_page;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+ gpa);
+ return;
+ }
+ msg_page = kmap_atomic(page);
+
+ msg = &msg_page->sint_message[sint];
+ msg->header.message_flags.msg_pending = 0;
+
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ int gsi, idx, stimers_pending;
+
+ trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+ if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+ synic_clear_sint_msg_pending(synic, sint);
+
+ /* Try to deliver pending Hyper-V SynIC timers messages */
+ stimers_pending = 0;
+ for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+ stimer = &hv_vcpu->stimer[idx];
+ if (stimer->msg_pending &&
+ (stimer->config & HV_STIMER_ENABLE) &&
+ HV_STIMER_SINT(stimer->config) == sint) {
+ set_bit(stimer->index,
+ hv_vcpu->stimer_pending_bitmap);
+ stimers_pending++;
+ }
+ }
+ if (stimers_pending)
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = atomic_read(&synic->sint_to_gsi[sint]);
+ if (gsi != -1)
+ kvm_notify_acked_gsi(kvm, gsi);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+ hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+ hv_vcpu->exit.u.synic.msr = msr;
+ hv_vcpu->exit.u.synic.control = synic->control;
+ hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+ hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+ kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+ u32 msr, u64 data, bool host)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ int ret;
+
+ if (!synic->active)
+ return 1;
+
+ trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ synic->control = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SVERSION:
+ if (!host) {
+ ret = 1;
+ break;
+ }
+ synic->version = data;
+ break;
+ case HV_X64_MSR_SIEFP:
+ if (data & HV_SYNIC_SIEFP_ENABLE)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->evt_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_SIMP:
+ if (data & HV_SYNIC_SIMP_ENABLE)
+ if (kvm_clear_guest(vcpu->kvm,
+ data & PAGE_MASK, PAGE_SIZE)) {
+ ret = 1;
+ break;
+ }
+ synic->msg_page = data;
+ if (!host)
+ synic_exit(synic, msr);
+ break;
+ case HV_X64_MSR_EOM: {
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ kvm_hv_notify_acked_sint(vcpu, i);
+ break;
+ }
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+{
+ int ret;
+
+ if (!synic->active)
+ return 1;
+
+ ret = 0;
+ switch (msr) {
+ case HV_X64_MSR_SCONTROL:
+ *pdata = synic->control;
+ break;
+ case HV_X64_MSR_SVERSION:
+ *pdata = synic->version;
+ break;
+ case HV_X64_MSR_SIEFP:
+ *pdata = synic->evt_page;
+ break;
+ case HV_X64_MSR_SIMP:
+ *pdata = synic->msg_page;
+ break;
+ case HV_X64_MSR_EOM:
+ *pdata = 0;
+ break;
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ *pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+ break;
+ default:
+ ret = 1;
+ break;
+ }
+ return ret;
+}
+
+int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct kvm_lapic_irq irq;
+ int ret, vector;
+
+ if (sint >= ARRAY_SIZE(synic->sint))
+ return -EINVAL;
+
+ vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+ if (vector < 0)
+ return -ENOENT;
+
+ memset(&irq, 0, sizeof(irq));
+ irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.vector = vector;
+ irq.level = 1;
+
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+ trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+ return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vcpu_id);
+ if (!synic)
+ return -EINVAL;
+
+ return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+ struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+ int i;
+
+ trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+ if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+ kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi)
+{
+ struct kvm_vcpu_hv_synic *synic;
+
+ synic = synic_get(kvm, vcpu_id);
+ if (!synic)
+ return -EINVAL;
+
+ if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+ return -EINVAL;
+
+ atomic_set(&synic->sint_to_gsi[sint], gsi);
+ return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_kernel_irq_routing_entry *e;
+ u32 gsi;
+
+ irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+ lockdep_is_held(&kvm->irq_lock));
+
+ for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+ hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+ if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+ kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+ e->hv_sint.sint, gsi);
+ }
+ }
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+ int i;
+
+ memset(synic, 0, sizeof(*synic));
+ synic->version = HV_SYNIC_VERSION_1;
+ for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+ atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+ atomic_set(&synic->sint_to_gsi[i], -1);
+ }
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+ return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+ bool vcpu_kick)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ set_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+ if (vcpu_kick)
+ kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+ trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+
+ hrtimer_cancel(&stimer->timer);
+ clear_bit(stimer->index,
+ vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+ stimer->msg_pending = false;
+ stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu_hv_stimer *stimer;
+
+ stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+ trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index);
+ stimer_mark_pending(stimer, true);
+
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has HV_STIMER_ENABLE flag
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+ u64 time_now;
+ ktime_t ktime_now;
+
+ time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+ ktime_now = ktime_get();
+
+ if (stimer->config & HV_STIMER_PERIODIC) {
+ if (stimer->exp_time) {
+ if (time_now >= stimer->exp_time) {
+ u64 remainder;
+
+ div64_u64_rem(time_now - stimer->exp_time,
+ stimer->count, &remainder);
+ stimer->exp_time =
+ time_now + (stimer->count - remainder);
+ }
+ } else
+ stimer->exp_time = time_now + stimer->count;
+
+ trace_kvm_hv_stimer_start_periodic(
+ stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->exp_time);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now,
+ 100 * (stimer->exp_time - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+ }
+ stimer->exp_time = stimer->count;
+ if (time_now >= stimer->count) {
+ /*
+ * Expire timer according to Hypervisor Top-Level Functional
+ * specification v4(15.3.1):
+ * "If a one shot is enabled and the specified count is in
+ * the past, it will expire immediately."
+ */
+ stimer_mark_pending(stimer, false);
+ return 0;
+ }
+
+ trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index,
+ time_now, stimer->count);
+
+ hrtimer_start(&stimer->timer,
+ ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+ HRTIMER_MODE_ABS);
+ return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+ bool host)
+{
+ trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, config, host);
+
+ stimer_cleanup(stimer);
+ if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+ config &= ~HV_STIMER_ENABLE;
+ stimer->config = config;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+ bool host)
+{
+ trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, count, host);
+
+ stimer_cleanup(stimer);
+ stimer->count = count;
+ if (stimer->count == 0)
+ stimer->config &= ~HV_STIMER_ENABLE;
+ else if (stimer->config & HV_STIMER_AUTOENABLE)
+ stimer->config |= HV_STIMER_ENABLE;
+ stimer_mark_pending(stimer, false);
+ return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+ *pconfig = stimer->config;
+ return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+ *pcount = stimer->count;
+ return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+ struct hv_message *src_msg)
+{
+ struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+ struct page *page;
+ gpa_t gpa;
+ struct hv_message *dst_msg;
+ int r;
+ struct hv_message_page *msg_page;
+
+ if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+ return -ENOENT;
+
+ gpa = synic->msg_page & PAGE_MASK;
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return -EFAULT;
+
+ msg_page = kmap_atomic(page);
+ dst_msg = &msg_page->sint_message[sint];
+ if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+ src_msg->header.message_type) != HVMSG_NONE) {
+ dst_msg->header.message_flags.msg_pending = 1;
+ r = -EAGAIN;
+ } else {
+ memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+ src_msg->header.payload_size);
+ dst_msg->header.message_type = src_msg->header.message_type;
+ dst_msg->header.payload_size = src_msg->header.payload_size;
+ r = synic_set_irq(synic, sint);
+ if (r >= 1)
+ r = 0;
+ else if (r == 0)
+ r = -EFAULT;
+ }
+ kunmap_atomic(msg_page);
+ kvm_release_page_dirty(page);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+ return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ payload->expiration_time = stimer->exp_time;
+ payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+ return synic_deliver_msg(vcpu_to_synic(vcpu),
+ HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+ int r;
+
+ stimer->msg_pending = true;
+ r = stimer_send_msg(stimer);
+ trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+ stimer->index, r);
+ if (!r) {
+ stimer->msg_pending = false;
+ if (!(stimer->config & HV_STIMER_PERIODIC))
+ stimer->config &= ~HV_STIMER_ENABLE;
+ }
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ struct kvm_vcpu_hv_stimer *stimer;
+ u64 time_now, exp_time;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+ stimer = &hv_vcpu->stimer[i];
+ if (stimer->config & HV_STIMER_ENABLE) {
+ exp_time = stimer->exp_time;
+
+ if (exp_time) {
+ time_now =
+ get_time_ref_counter(vcpu->kvm);
+ if (time_now >= exp_time)
+ stimer_expiration(stimer);
+ }
+
+ if ((stimer->config & HV_STIMER_ENABLE) &&
+ stimer->count)
+ stimer_start(stimer);
+ else
+ stimer_cleanup(stimer);
+ }
+ }
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct hv_message *msg = &stimer->msg;
+ struct hv_timer_message_payload *payload =
+ (struct hv_timer_message_payload *)&msg->u.payload;
+
+ memset(&msg->header, 0, sizeof(msg->header));
+ msg->header.message_type = HVMSG_TIMER_EXPIRED;
+ msg->header.payload_size = sizeof(*payload);
+
+ payload->timer_index = stimer->index;
+ payload->expiration_time = 0;
+ payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+ memset(stimer, 0, sizeof(*stimer));
+ stimer->index = timer_index;
+ hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ stimer->timer.function = stimer_timer_callback;
+ stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+ int i;
+
+ synic_init(&hv_vcpu->synic);
+
+ bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+ for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+ stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Hyper-V SynIC auto EOI SINT's are
+ * not compatible with APICV, so deactivate APICV
+ */
+ kvm_vcpu_deactivate_apicv(vcpu);
+ vcpu_to_synic(vcpu)->active = true;
+ return 0;
+}
+
static bool kvm_hv_msr_partition_wide(u32 msr)
{
bool r = false;
@@ -226,6 +878,31 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
hv->runtime_offset = data - current_task_runtime_100ns();
break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+ data, host);
+ }
default:
vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
@@ -248,11 +925,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_HYPERCALL:
data = hv->hv_hypercall;
break;
- case HV_X64_MSR_TIME_REF_COUNT: {
- data =
- div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+ case HV_X64_MSR_TIME_REF_COUNT:
+ data = get_time_ref_counter(kvm);
break;
- }
case HV_X64_MSR_REFERENCE_TSC:
data = hv->hv_tsc_page;
break;
@@ -304,6 +979,31 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case HV_X64_MSR_VP_RUNTIME:
data = current_task_runtime_100ns() + hv->runtime_offset;
break;
+ case HV_X64_MSR_SCONTROL:
+ case HV_X64_MSR_SVERSION:
+ case HV_X64_MSR_SIEFP:
+ case HV_X64_MSR_SIMP:
+ case HV_X64_MSR_EOM:
+ case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+ return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+ case HV_X64_MSR_STIMER0_CONFIG:
+ case HV_X64_MSR_STIMER1_CONFIG:
+ case HV_X64_MSR_STIMER2_CONFIG:
+ case HV_X64_MSR_STIMER3_CONFIG: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+ return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
+ case HV_X64_MSR_STIMER0_COUNT:
+ case HV_X64_MSR_STIMER1_COUNT:
+ case HV_X64_MSR_STIMER2_COUNT:
+ case HV_X64_MSR_STIMER3_COUNT: {
+ int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+ return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+ pdata);
+ }
default:
vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index c7bce559f67b..60eccd4bd1d3 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,9 +24,64 @@
#ifndef __ARCH_X86_KVM_HYPERV_H__
#define __ARCH_X86_KVM_HYPERV_H__
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+ struct kvm_vcpu_arch *arch;
+
+ arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+ return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+ return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+ return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+
bool kvm_hv_hypercall_enabled(struct kvm *kvm);
int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+ int timer_index)
+{
+ return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+ struct kvm_vcpu_hv *hv_vcpu;
+
+ hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+ stimer[0]);
+ return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+ return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+ HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 88d0a92d3f94..1facfd60b04a 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -233,7 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
}
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
{
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
union kvm_ioapic_redirect_entry *e;
@@ -250,7 +250,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
(e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
kvm_apic_pending_eoi(vcpu, e->fields.vector)))
__set_bit(e->fields.vector,
- (unsigned long *)eoi_exit_bitmap);
+ ioapic_handled_vectors);
}
}
spin_unlock(&ioapic->lock);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 084617d37c74..2d16dc251d81 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -121,7 +121,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map);
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
-
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors);
#endif
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 5c520ebf6343..a22a488b4622 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
unsigned long npages)
{
gfn_t end_gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + npages;
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
return pfn;
}
-static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
+ unsigned long npages)
{
unsigned long i;
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
{
gfn_t gfn, end_gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
int r = 0;
struct iommu_domain *domain = kvm->arch.iommu_domain;
int flags;
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
{
struct iommu_domain *domain;
gfn_t end_gfn, gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
u64 phys;
domain = kvm->arch.iommu_domain;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 097060e33bd6..3982b479bb5f 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -76,7 +76,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
if (kvm_cpu_has_extint(v))
return 1;
- if (kvm_vcpu_apic_vid_enabled(v))
+ if (kvm_vcpu_apicv_active(v))
return 0;
return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 84b96d319909..8fc89efb5250 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -33,6 +33,8 @@
#include "lapic.h"
+#include "hyperv.h"
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -219,6 +221,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
srcu_read_unlock(&kvm->irq_srcu, idx);
}
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ if (!level)
+ return -1;
+
+ return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
@@ -257,6 +269,11 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
break;
+ case KVM_IRQ_ROUTING_HV_SINT:
+ e->set = kvm_hv_set_sint;
+ e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+ e->hv_sint.sint = ue->u.hv_sint.sint;
+ break;
default:
goto out;
}
@@ -332,14 +349,15 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
}
-void kvm_arch_irq_routing_update(struct kvm *kvm)
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
return;
kvm_make_scan_ioapic_request(kvm);
}
-void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
+ ulong *ioapic_handled_vectors)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_kernel_irq_routing_entry *entry;
@@ -369,9 +387,26 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
u32 vector = entry->msi.data & 0xff;
__set_bit(vector,
- (unsigned long *) eoi_exit_bitmap);
+ ioapic_handled_vectors);
}
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
}
+
+int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ switch (irq->type) {
+ case KVM_IRQ_ROUTING_HV_SINT:
+ return kvm_hv_set_sint(irq, kvm, irq_source_id, level,
+ line_status);
+ default:
+ return -EWOULDBLOCK;
+ }
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+ kvm_hv_irq_routing_update(kvm);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d30b865be30..36591faed13b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "x86.h"
#include "cpuid.h"
+#include "hyperv.h"
#ifndef CONFIG_X86_64
#define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -128,11 +129,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline int kvm_apic_id(struct kvm_lapic *apic)
-{
- return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
-}
-
/* The logical map is definitely wrong if we have multiple
* modes at the same time. (Physical map is always right.)
*/
@@ -379,7 +375,8 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
if (!apic->irr_pending)
return -1;
- kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ if (apic->vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
@@ -392,7 +389,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
- if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
+ if (unlikely(vcpu->arch.apicv_active)) {
/* try to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -418,7 +415,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
* because the processor can modify ISR under the hood. Instead
* just set SVI.
*/
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ if (unlikely(vcpu->arch.apicv_active))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
else {
++apic->isr_count;
@@ -466,7 +463,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
* on the other hand isr_count and highest_isr_cache are unused
* and must be left alone.
*/
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
+ if (unlikely(vcpu->arch.apicv_active))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
else {
@@ -852,7 +849,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic_clear_vector(vector, apic->regs + APIC_TMR);
}
- if (kvm_x86_ops->deliver_posted_interrupt)
+ if (vcpu->arch.apicv_active)
kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
else {
apic_set_irr(vector, apic);
@@ -932,7 +929,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
{
- return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
+ return test_bit(vector, apic->vcpu->arch.ioapic_handled_vectors);
}
static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
@@ -974,6 +971,9 @@ static int apic_set_eoi(struct kvm_lapic *apic)
apic_clear_isr(vector, apic);
apic_update_ppr(apic);
+ if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+ kvm_hv_synic_send_eoi(apic->vcpu, vector);
+
kvm_ioapic_send_eoi(apic, vector);
kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
return vector;
@@ -1225,7 +1225,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
int vec = reg & APIC_VECTOR_MASK;
void *bitmap = apic->regs + APIC_ISR;
- if (kvm_x86_ops->deliver_posted_interrupt)
+ if (vcpu->arch.apicv_active)
bitmap = apic->regs + APIC_IRR;
if (apic_test_vector(vec, bitmap))
@@ -1693,8 +1693,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
- apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
+ apic->irr_pending = vcpu->arch.apicv_active;
+ apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -1883,6 +1883,12 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
apic_set_isr(vector, apic);
apic_update_ppr(apic);
apic_clear_irr(vector, apic);
+
+ if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+ apic_clear_isr(vector, apic);
+ apic_update_ppr(apic);
+ }
+
return vector;
}
@@ -1906,15 +1912,15 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
update_divide_count(apic);
start_apic_timer(apic);
apic->irr_pending = true;
- apic->isr_count = kvm_x86_ops->hwapic_isr_update ?
+ apic->isr_count = vcpu->arch.apicv_active ?
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
- if (kvm_x86_ops->hwapic_irr_update)
+ if (vcpu->arch.apicv_active) {
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
- if (unlikely(kvm_x86_ops->hwapic_isr_update))
kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
apic_find_highest_isr(apic));
+ }
kvm_make_request(KVM_REQ_EVENT, vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_rtc_eoi_tracking_restore_one(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index fde8e35d5850..41bdb35b4b67 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
}
-static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->cpu_uses_apicv(vcpu);
+ return vcpu->arch.apic && vcpu->arch.apicv_active;
}
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -164,6 +164,11 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
+static inline int kvm_apic_id(struct kvm_lapic *apic)
+{
+ return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+}
+
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e7c2c1428a69..95a955de5964 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte)
}
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- pfn_t pfn, unsigned access)
+ kvm_pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -311,11 +311,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_rmap_spte(u64 pte)
-{
- return is_shadow_present_pte(pte);
-}
-
static int is_last_spte(u64 pte, int level)
{
if (level == PT_PAGE_TABLE_LEVEL)
@@ -325,7 +320,7 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
-static pfn_t spte_to_pfn(u64 pte)
+static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
@@ -540,7 +535,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;
bool ret = false;
- WARN_ON(!is_rmap_spte(new_spte));
+ WARN_ON(!is_shadow_present_pte(new_spte));
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
@@ -587,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
u64 old_spte = *sptep;
if (!spte_has_volatile_bits(old_spte))
@@ -595,7 +590,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
- if (!is_rmap_spte(old_spte))
+ if (!is_shadow_present_pte(old_spte))
return 0;
pfn = spte_to_pfn(old_spte);
@@ -909,36 +904,35 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
}
/*
- * Pte mapping structures:
+ * About rmap_head encoding:
*
- * If pte_list bit zero is zero, then pte_list point to the spte.
- *
- * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
+ * If the bit zero of rmap_head->val is clear, then it points to the only spte
+ * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
- *
- * Returns the number of pte entries before the spte was added or zero if
- * the spte was not added.
- *
+ */
+
+/*
+ * Returns the number of pointers in the rmap chain, not counting the new one.
*/
static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
- unsigned long *pte_list)
+ struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
int i, count = 0;
- if (!*pte_list) {
+ if (!rmap_head->val) {
rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
- *pte_list = (unsigned long)spte;
- } else if (!(*pte_list & 1)) {
+ rmap_head->val = (unsigned long)spte;
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
desc = mmu_alloc_pte_list_desc(vcpu);
- desc->sptes[0] = (u64 *)*pte_list;
+ desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
- *pte_list = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
desc = desc->more;
count += PTE_LIST_EXT;
@@ -955,8 +949,9 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
}
static void
-pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
- int i, struct pte_list_desc *prev_desc)
+pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
+ struct pte_list_desc *desc, int i,
+ struct pte_list_desc *prev_desc)
{
int j;
@@ -967,43 +962,43 @@ pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
if (j != 0)
return;
if (!prev_desc && !desc->more)
- *pte_list = (unsigned long)desc->sptes[0];
+ rmap_head->val = (unsigned long)desc->sptes[0];
else
if (prev_desc)
prev_desc->more = desc->more;
else
- *pte_list = (unsigned long)desc->more | 1;
+ rmap_head->val = (unsigned long)desc->more | 1;
mmu_free_pte_list_desc(desc);
}
-static void pte_list_remove(u64 *spte, unsigned long *pte_list)
+static void pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
struct pte_list_desc *prev_desc;
int i;
- if (!*pte_list) {
+ if (!rmap_head->val) {
printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
BUG();
- } else if (!(*pte_list & 1)) {
+ } else if (!(rmap_head->val & 1)) {
rmap_printk("pte_list_remove: %p 1->0\n", spte);
- if ((u64 *)*pte_list != spte) {
+ if ((u64 *)rmap_head->val != spte) {
printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
BUG();
}
- *pte_list = 0;
+ rmap_head->val = 0;
} else {
rmap_printk("pte_list_remove: %p many->many\n", spte);
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+ for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
if (desc->sptes[i] == spte) {
- pte_list_desc_remove_entry(pte_list,
- desc, i,
- prev_desc);
+ pte_list_desc_remove_entry(rmap_head,
+ desc, i, prev_desc);
return;
}
+ }
prev_desc = desc;
desc = desc->more;
}
@@ -1012,28 +1007,8 @@ static void pte_list_remove(u64 *spte, unsigned long *pte_list)
}
}
-typedef void (*pte_list_walk_fn) (u64 *spte);
-static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
-{
- struct pte_list_desc *desc;
- int i;
-
- if (!*pte_list)
- return;
-
- if (!(*pte_list & 1))
- return fn((u64 *)*pte_list);
-
- desc = (struct pte_list_desc *)(*pte_list & ~1ul);
- while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
- fn(desc->sptes[i]);
- desc = desc->more;
- }
-}
-
-static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
+ struct kvm_memory_slot *slot)
{
unsigned long idx;
@@ -1041,10 +1016,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
}
-/*
- * Take gfn and return the reverse mapping to it.
- */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
+static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
+ struct kvm_mmu_page *sp)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -1065,24 +1038,24 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
struct kvm_mmu_page *sp;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
- return pte_list_add(vcpu, spte, rmapp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ return pte_list_add(vcpu, spte, rmap_head);
}
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp);
- pte_list_remove(spte, rmapp);
+ rmap_head = gfn_to_rmap(kvm, gfn, sp);
+ pte_list_remove(spte, rmap_head);
}
/*
@@ -1102,19 +1075,26 @@ struct rmap_iterator {
*
* Returns sptep if found, NULL otherwise.
*/
-static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
+ struct rmap_iterator *iter)
{
- if (!rmap)
+ u64 *sptep;
+
+ if (!rmap_head->val)
return NULL;
- if (!(rmap & 1)) {
+ if (!(rmap_head->val & 1)) {
iter->desc = NULL;
- return (u64 *)rmap;
+ sptep = (u64 *)rmap_head->val;
+ goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
iter->pos = 0;
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
/*
@@ -1124,14 +1104,14 @@ static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
*/
static u64 *rmap_get_next(struct rmap_iterator *iter)
{
+ u64 *sptep;
+
if (iter->desc) {
if (iter->pos < PTE_LIST_EXT - 1) {
- u64 *sptep;
-
++iter->pos;
sptep = iter->desc->sptes[iter->pos];
if (sptep)
- return sptep;
+ goto out;
}
iter->desc = iter->desc->more;
@@ -1139,17 +1119,20 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
if (iter->desc) {
iter->pos = 0;
/* desc->sptes[0] cannot be NULL */
- return iter->desc->sptes[iter->pos];
+ sptep = iter->desc->sptes[iter->pos];
+ goto out;
}
}
return NULL;
+out:
+ BUG_ON(!is_shadow_present_pte(*sptep));
+ return sptep;
}
-#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
- for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
- _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
- _spte_ = rmap_get_next(_iter_))
+#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
+ _spte_; _spte_ = rmap_get_next(_iter_))
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
@@ -1207,14 +1190,15 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
return flush;
@@ -1231,13 +1215,13 @@ static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
return flush;
@@ -1254,13 +1238,13 @@ static bool spte_set_dirty(struct kvm *kvm, u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
return flush;
@@ -1280,12 +1264,12 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmapp, false);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_write_protect(kvm, rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1305,12 +1289,12 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
while (mask) {
- rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PT_PAGE_TABLE_LEVEL, slot);
- __rmap_clear_dirty(kvm, rmapp);
+ rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PT_PAGE_TABLE_LEVEL, slot);
+ __rmap_clear_dirty(kvm, rmap_head);
/* clear the first set bit */
mask &= mask - 1;
@@ -1342,28 +1326,27 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
+ rmap_head = __gfn_to_rmap(gfn, i, slot);
+ write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
}
return write_protected;
}
-static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
bool flush = false;
- while ((sptep = rmap_get_first(*rmapp, &iter))) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
+ while ((sptep = rmap_get_first(rmap_head, &iter))) {
rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
@@ -1373,14 +1356,14 @@ static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
return flush;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmapp);
+ return kvm_zap_rmapp(kvm, rmap_head);
}
-static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1389,13 +1372,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
int need_flush = 0;
u64 new_spte;
pte_t *ptep = (pte_t *)data;
- pfn_t new_pfn;
+ kvm_pfn_t new_pfn;
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1433,11 +1416,11 @@ struct slot_rmap_walk_iterator {
/* output fields. */
gfn_t gfn;
- unsigned long *rmap;
+ struct kvm_rmap_head *rmap;
int level;
/* private field. */
- unsigned long *end_rmap;
+ struct kvm_rmap_head *end_rmap;
};
static void
@@ -1496,7 +1479,7 @@ static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long end,
unsigned long data,
int (*handler)(struct kvm *kvm,
- unsigned long *rmapp,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn,
int level,
@@ -1540,7 +1523,8 @@ static int kvm_handle_hva_range(struct kvm *kvm,
static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ int (*handler)(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot,
gfn_t gfn, int level,
unsigned long data))
@@ -1563,7 +1547,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
-static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
@@ -1573,18 +1557,19 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
+ }
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
-static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn,
int level, unsigned long data)
{
@@ -1600,11 +1585,12 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
+ }
out:
return young;
}
@@ -1613,14 +1599,14 @@ out:
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
sp = page_header(__pa(spte));
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
@@ -1720,8 +1706,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
mmu_spte_clear_no_track(parent_pte);
}
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
- u64 *parent_pte, int direct)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
{
struct kvm_mmu_page *sp;
@@ -1737,8 +1722,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
* this feature. See the comments in kvm_zap_obsolete_pages().
*/
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
- sp->parent_ptes = 0;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -1746,7 +1729,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
static void mark_unsync(u64 *spte);
static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- pte_list_walk(&sp->parent_ptes, mark_unsync);
+ u64 *sptep;
+ struct rmap_iterator iter;
+
+ for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
+ mark_unsync(sptep);
+ }
}
static void mark_unsync(u64 *spte)
@@ -1806,6 +1794,13 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
return (pvec->nr == KVM_PAGE_ARRAY_NR);
}
+static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
+{
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+}
+
static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_pages *pvec)
{
@@ -1815,8 +1810,10 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
struct kvm_mmu_page *child;
u64 ent = sp->spt[i];
- if (!is_shadow_present_pte(ent) || is_large_pte(ent))
- goto clear_child_bitmap;
+ if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ }
child = page_header(ent & PT64_BASE_ADDR_MASK);
@@ -1825,28 +1822,21 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return -ENOSPC;
ret = __mmu_unsync_walk(child, pvec);
- if (!ret)
- goto clear_child_bitmap;
- else if (ret > 0)
+ if (!ret) {
+ clear_unsync_child_bit(sp, i);
+ continue;
+ } else if (ret > 0) {
nr_unsync_leaf += ret;
- else
+ } else
return ret;
} else if (child->unsync) {
nr_unsync_leaf++;
if (mmu_pages_add(pvec, child, i))
return -ENOSPC;
} else
- goto clear_child_bitmap;
-
- continue;
-
-clear_child_bitmap:
- __clear_bit(i, sp->unsync_child_bitmap);
- sp->unsync_children--;
- WARN_ON((int)sp->unsync_children < 0);
+ clear_unsync_child_bit(sp, i);
}
-
return nr_unsync_leaf;
}
@@ -2009,9 +1999,7 @@ static void mmu_pages_clear_parents(struct mmu_page_path *parents)
if (!sp)
return;
- --sp->unsync_children;
- WARN_ON((int)sp->unsync_children < 0);
- __clear_bit(idx, sp->unsync_child_bitmap);
+ clear_unsync_child_bit(sp, idx);
level++;
} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
}
@@ -2053,14 +2041,6 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
}
-static void init_shadow_page_table(struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = 0ull;
-}
-
static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
{
sp->write_flooding_count = 0;
@@ -2083,8 +2063,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int direct,
- unsigned access,
- u64 *parent_pte)
+ unsigned access)
{
union kvm_mmu_page_role role;
unsigned quadrant;
@@ -2116,21 +2095,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
break;
- mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- if (sp->unsync_children) {
+ if (sp->unsync_children)
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
- kvm_mmu_mark_parents_unsync(sp);
- } else if (sp->unsync)
- kvm_mmu_mark_parents_unsync(sp);
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
return sp;
}
+
++vcpu->kvm->stat.mmu_cache_miss;
- sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
- if (!sp)
- return sp;
+
+ sp = kvm_mmu_alloc_page(vcpu, direct);
+
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link,
@@ -2144,7 +2120,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
account_shadowed(vcpu->kvm, sp);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
- init_shadow_page_table(sp);
+ clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -2198,7 +2174,8 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
return __shadow_walk_next(iterator, *iterator->sptep);
}
-static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
+static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
+ struct kvm_mmu_page *sp)
{
u64 spte;
@@ -2206,12 +2183,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed)
VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask;
-
- if (accessed)
- spte |= shadow_accessed_mask;
+ shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
mmu_spte_set(sptep, spte);
+
+ mmu_page_add_parent_pte(vcpu, sp, sptep);
+
+ if (sp->unsync_children || sp->unsync)
+ mark_unsync(sptep);
}
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2270,17 +2249,12 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
mmu_page_zap_pte(kvm, sp, sp->spt + i);
}
-static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
-{
- mmu_page_remove_parent_pte(sp, parent_pte);
-}
-
static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *sptep;
struct rmap_iterator iter;
- while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+ while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
drop_parent_pte(sp, sptep);
}
@@ -2476,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
return 0;
}
-static bool kvm_is_mmio_pfn(pfn_t pfn)
+static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
{
if (pfn_valid(pfn))
return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
@@ -2486,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int level,
- gfn_t gfn, pfn_t pfn, bool speculative,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
u64 spte;
@@ -2564,18 +2538,18 @@ done:
return ret;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int write_fault, int *emulate,
- int level, gfn_t gfn, pfn_t pfn, bool speculative,
- bool host_writable)
+static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
+ bool emulate = false;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
- if (is_rmap_spte(*sptep)) {
+ if (is_shadow_present_pte(*sptep)) {
/*
* If we overwrite a PTE page pointer with a 2MB PMD, unlink
* the parent of the now unreachable PTE.
@@ -2600,12 +2574,12 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- *emulate = 1;
+ emulate = true;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
- if (unlikely(is_mmio_spte(*sptep) && emulate))
- *emulate = 1;
+ if (unlikely(is_mmio_spte(*sptep)))
+ emulate = true;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2624,9 +2598,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
kvm_release_pfn_clean(pfn);
+
+ return emulate;
}
-static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
struct kvm_memory_slot *slot;
@@ -2658,9 +2634,8 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
return -1;
for (i = 0; i < ret; i++, gfn++, start++)
- mmu_set_spte(vcpu, start, access, 0, NULL,
- sp->role.level, gfn, page_to_pfn(pages[i]),
- true, true);
+ mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
return 0;
}
@@ -2708,9 +2683,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn,
- bool prefault)
+static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
+ int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2722,9 +2696,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
- write, &emulate, level, gfn, pfn,
- prefault, map_writable);
+ emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+ write, level, gfn, pfn, prefault,
+ map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2737,10 +2711,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
pseudo_gfn = base_addr >> PAGE_SHIFT;
sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
- iterator.level - 1,
- 1, ACC_ALL, iterator.sptep);
+ iterator.level - 1, 1, ACC_ALL);
- link_shadow_page(iterator.sptep, sp, true);
+ link_shadow_page(vcpu, iterator.sptep, sp);
}
}
return emulate;
@@ -2759,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
{
/*
* Do not cache the mmio info caused by writing the readonly gfn
@@ -2779,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+ gfn_t *gfnp, kvm_pfn_t *pfnp,
+ int *levelp)
{
- pfn_t pfn = *pfnp;
+ kvm_pfn_t pfn = *pfnp;
gfn_t gfn = *gfnp;
int level = *levelp;
@@ -2820,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- pfn_t pfn, unsigned access, int *ret_val)
+ kvm_pfn_t pfn, unsigned access, int *ret_val)
{
bool ret = true;
@@ -2919,7 +2893,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
* If the mapping has been changed, let the vcpu fault on the
* same address again.
*/
- if (!is_rmap_spte(spte)) {
+ if (!is_shadow_present_pte(spte)) {
ret = true;
goto exit;
}
@@ -2974,7 +2948,7 @@ exit:
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable);
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
@@ -2983,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
int r;
int level;
bool force_pt_level = false;
- pfn_t pfn;
+ kvm_pfn_t pfn;
unsigned long mmu_seq;
bool map_writable, write = error_code & PFERR_WRITE_MASK;
@@ -3018,11 +2992,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
- prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
-
return r;
out_unlock:
@@ -3097,8 +3069,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
- 1, ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 1, ACC_ALL);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.root_hpa = __pa(sp->spt);
@@ -3110,9 +3081,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
- i << 30,
- PT32_ROOT_LEVEL, 1, ACC_ALL,
- NULL);
+ i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3149,7 +3118,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
- 0, ACC_ALL, NULL);
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3182,9 +3151,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
spin_lock(&vcpu->kvm->mmu_lock);
make_mmu_pages_available(vcpu);
- sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, 0,
- ACC_ALL, NULL);
+ sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
+ 0, ACC_ALL);
root = __pa(sp->spt);
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3443,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, pfn_t *pfn, bool write, bool *writable)
+ gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -3481,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
- pfn_t pfn;
+ kvm_pfn_t pfn;
int r;
int level;
bool force_pt_level;
@@ -3531,8 +3499,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
make_mmu_pages_available(vcpu);
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn, prefault);
+ r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -4058,10 +4025,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
g_context->inject_page_fault = kvm_inject_page_fault;
/*
- * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
- * translation of l2_gpa to l1_gpa addresses is done using the
- * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
- * functions between mmu and nested_mmu are swapped.
+ * Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
+ * L1's nested page tables (e.g. EPT12). The nested translation
+ * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+ * L2's page tables as the first level of translation and L1's
+ * nested page tables as the second level of translation. Basically
+ * the gva_to_gpa functions between mmu and nested_mmu are swapped.
*/
if (!is_paging(vcpu)) {
g_context->nx = false;
@@ -4495,7 +4464,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
}
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
/* The caller should hold mmu-lock before calling this function. */
static bool
@@ -4589,9 +4558,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
spin_unlock(&kvm->mmu_lock);
}
-static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+static bool slot_rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head)
{
- return __rmap_write_protect(kvm, rmapp, false);
+ return __rmap_write_protect(kvm, rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -4627,16 +4597,16 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- unsigned long *rmapp)
+ struct kvm_rmap_head *rmap_head)
{
u64 *sptep;
struct rmap_iterator iter;
int need_tlb_flush = 0;
- pfn_t pfn;
+ kvm_pfn_t pfn;
struct kvm_mmu_page *sp;
restart:
- for_each_rmap_spte(rmapp, &iter, sptep) {
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 03d518e499a6..dcce533d420c 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp;
gfn_t gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
hpa_t hpa;
sp = page_header(__pa(sptep));
@@ -129,7 +129,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *rev_sp;
struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
@@ -150,8 +150,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
- if (!*rmapp) {
+ rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ if (!rmap_head->val) {
if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no rmap for writable spte %llx\n",
@@ -183,7 +183,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
return;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(sp->spt[i]))
+ if (!is_shadow_present_pte(sp->spt[i]))
continue;
inspect_spte_has_rmap(kvm, sp->spt + i);
@@ -192,7 +192,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- unsigned long *rmapp;
+ struct kvm_rmap_head *rmap_head;
u64 *sptep;
struct rmap_iterator iter;
struct kvm_memslots *slots;
@@ -203,13 +203,14 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, sp->gfn);
- rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
+ rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
- for_each_rmap_spte(rmapp, &iter, sptep)
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
+ }
}
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3058a22a658d..6c9fed957cce 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
{
unsigned pte_access;
gfn_t gfn;
- pfn_t pfn;
+ kvm_pfn_t pfn;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return false;
@@ -475,8 +475,8 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* we call mmu_set_spte() with host_writable = true because
* pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
- mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL,
- gfn, pfn, true, true);
+ mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
+ true, true);
return true;
}
@@ -551,12 +551,12 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
- pfn_t pfn, bool map_writable, bool prefault)
+ kvm_pfn_t pfn, bool map_writable, bool prefault)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate = 0;
+ int top_level, emulate;
direct_access = gw->pte_access;
@@ -587,7 +587,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
- false, access, it.sptep);
+ false, access);
}
/*
@@ -598,7 +598,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
goto out_gpte_changed;
if (sp)
- link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
+ link_shadow_page(vcpu, it.sptep, sp);
}
for (;
@@ -617,20 +617,18 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
- true, direct_access, it.sptep);
- link_shadow_page(it.sptep, sp, PT_GUEST_ACCESSED_MASK);
+ true, direct_access);
+ link_shadow_page(vcpu, it.sptep, sp);
}
clear_sp_write_flooding_count(it.sptep);
- mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return emulate;
out_gpte_changed:
- if (sp)
- kvm_mmu_put_page(sp, it.sptep);
kvm_release_pfn_clean(pfn);
return 0;
}
@@ -696,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
int r;
- pfn_t pfn;
+ kvm_pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
bool force_pt_level = false;
unsigned long mmu_seq;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 22aef20bf9c7..c13a64b7d789 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -86,6 +86,7 @@ static const u32 host_save_user_msrs[] = {
MSR_FS_BASE,
#endif
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_TSC_AUX,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
@@ -135,6 +136,7 @@ struct vcpu_svm {
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
+ uint64_t tsc_aux;
u64 next_rip;
@@ -1238,6 +1240,9 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
}
}
+ /* This assumes that the kernel never uses MSR_TSC_AUX */
+ if (static_cpu_has(X86_FEATURE_RDTSCP))
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3024,6 +3029,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_SYSENTER_ESP:
msr_info->data = svm->sysenter_esp;
break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+ msr_info->data = svm->tsc_aux;
+ break;
/*
* Nobody will change the following 5 values in the VMCB so we can
* safely return them on rdmsr. They will always be 0 until LBRV is
@@ -3162,6 +3172,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->sysenter_esp = data;
svm->vmcb->save.sysenter_esp = data;
break;
+ case MSR_TSC_AUX:
+ if (!boot_cpu_has(X86_FEATURE_RDTSCP))
+ return 1;
+
+ /*
+ * This is rare, so we update the MSR here instead of using
+ * direct_access_msrs. Doing that would require a rdmsr in
+ * svm_vcpu_put.
+ */
+ svm->tsc_aux = data;
+ wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
+ break;
case MSR_IA32_DEBUGCTLMSR:
if (!boot_cpu_has(X86_FEATURE_LBRV)) {
vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
@@ -3578,12 +3600,16 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
return;
}
-static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
+static bool svm_get_enable_apicv(void)
+{
+ return false;
+}
+
+static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
- return 0;
}
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
return;
}
@@ -4054,7 +4080,7 @@ static int svm_get_lpage_level(void)
static bool svm_rdtscp_supported(void)
{
- return false;
+ return boot_cpu_has(X86_FEATURE_RDTSCP);
}
static bool svm_invpcid_supported(void)
@@ -4345,7 +4371,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
- .cpu_uses_apicv = svm_cpu_uses_apicv,
+ .get_enable_apicv = svm_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.sync_pir_to_irr = svm_sync_pir_to_irr,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 120302511802..ad9f6a23f139 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -268,7 +268,7 @@ TRACE_EVENT(kvm_inj_virq,
#define kvm_trace_sym_exc \
EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \
EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \
- EXS(MF), EXS(MC)
+ EXS(MF), EXS(AC), EXS(MC)
/*
* Tracepoint for kvm interrupt injection:
@@ -1025,6 +1025,269 @@ TRACE_EVENT(kvm_pi_irte_update,
__entry->pi_desc_addr)
);
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+ TP_PROTO(int vcpu_id, u32 sint),
+ TP_ARGS(vcpu_id, sint),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ ),
+
+ TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+ TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+ TP_ARGS(vcpu_id, sint, vector, ret),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->sint = sint;
+ __entry->vector = vector;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("vcpu_id %d sint %u vector %d ret %d",
+ __entry->vcpu_id, __entry->sint, __entry->vector,
+ __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+ TP_PROTO(int vcpu_id, int vector),
+ TP_ARGS(vcpu_id, vector),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, sint)
+ __field(int, vector)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->vector = vector;
+ ),
+
+ TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+ TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+ TP_ARGS(vcpu_id, msr, data, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(u32, msr)
+ __field(u64, data)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->msr = msr;
+ __entry->data = data;
+ __entry->host = host
+ ),
+
+ TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+ __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+ TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+ TP_ARGS(vcpu_id, timer_index, config, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, config)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->config = config;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->config,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+ TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+ TP_ARGS(vcpu_id, timer_index, count, host),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, count)
+ __field(bool, host)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->count = count;
+ __entry->host = host;
+ ),
+
+ TP_printk("vcpu_id %d timer %d count %llu host %d",
+ __entry->vcpu_id, __entry->timer_index, __entry->count,
+ __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+ TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, exp_time)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->exp_time = exp_time;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+ TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+ TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(u64, time_now)
+ __field(u64, count)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->time_now = time_now;
+ __entry->count = count;
+ ),
+
+ TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+ __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+ __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+ TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+ TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ __field(int, msg_send_result)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ __entry->msg_send_result = msg_send_result;
+ ),
+
+ TP_printk("vcpu_id %d timer %d msg send result %d",
+ __entry->vcpu_id, __entry->timer_index,
+ __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+ TP_PROTO(int vcpu_id, int timer_index),
+ TP_ARGS(vcpu_id, timer_index),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(int, timer_index)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->timer_index = timer_index;
+ ),
+
+ TP_printk("vcpu_id %d timer %d",
+ __entry->vcpu_id, __entry->timer_index)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 44976a596fa6..e2951b6edbbc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -19,6 +19,7 @@
#include "irq.h"
#include "mmu.h"
#include "cpuid.h"
+#include "lapic.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
@@ -862,7 +863,6 @@ static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
-static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -870,7 +870,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
@@ -1448,7 +1447,51 @@ static inline void ept_sync_context(u64 eptp)
}
}
-static __always_inline unsigned long vmcs_readl(unsigned long field)
+static __always_inline void vmcs_check16(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "16-bit accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "16-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "16-bit accessor invalid for 32-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "16-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check32(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "32-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "32-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_check64(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "64-bit accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "64-bit accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "64-bit accessor invalid for 32-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
+ "64-bit accessor invalid for natural width field");
+}
+
+static __always_inline void vmcs_checkl(unsigned long field)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0,
+ "Natural width accessor invalid for 16-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2000,
+ "Natural width accessor invalid for 64-bit field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
+ "Natural width accessor invalid for 64-bit high field");
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
+ "Natural width accessor invalid for 32-bit field");
+}
+
+static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -1459,23 +1502,32 @@ static __always_inline unsigned long vmcs_readl(unsigned long field)
static __always_inline u16 vmcs_read16(unsigned long field)
{
- return vmcs_readl(field);
+ vmcs_check16(field);
+ return __vmcs_readl(field);
}
static __always_inline u32 vmcs_read32(unsigned long field)
{
- return vmcs_readl(field);
+ vmcs_check32(field);
+ return __vmcs_readl(field);
}
static __always_inline u64 vmcs_read64(unsigned long field)
{
+ vmcs_check64(field);
#ifdef CONFIG_X86_64
- return vmcs_readl(field);
+ return __vmcs_readl(field);
#else
- return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
+ return __vmcs_readl(field) | ((u64)__vmcs_readl(field+1) << 32);
#endif
}
+static __always_inline unsigned long vmcs_readl(unsigned long field)
+{
+ vmcs_checkl(field);
+ return __vmcs_readl(field);
+}
+
static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
@@ -1483,7 +1535,7 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value)
dump_stack();
}
-static void vmcs_writel(unsigned long field, unsigned long value)
+static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
{
u8 error;
@@ -1493,33 +1545,46 @@ static void vmcs_writel(unsigned long field, unsigned long value)
vmwrite_error(field, value);
}
-static void vmcs_write16(unsigned long field, u16 value)
+static __always_inline void vmcs_write16(unsigned long field, u16 value)
{
- vmcs_writel(field, value);
+ vmcs_check16(field);
+ __vmcs_writel(field, value);
}
-static void vmcs_write32(unsigned long field, u32 value)
+static __always_inline void vmcs_write32(unsigned long field, u32 value)
{
- vmcs_writel(field, value);
+ vmcs_check32(field);
+ __vmcs_writel(field, value);
}
-static void vmcs_write64(unsigned long field, u64 value)
+static __always_inline void vmcs_write64(unsigned long field, u64 value)
{
- vmcs_writel(field, value);
+ vmcs_check64(field);
+ __vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
- vmcs_writel(field+1, value >> 32);
+ __vmcs_writel(field+1, value >> 32);
#endif
}
-static void vmcs_clear_bits(unsigned long field, u32 mask)
+static __always_inline void vmcs_writel(unsigned long field, unsigned long value)
{
- vmcs_writel(field, vmcs_readl(field) & ~mask);
+ vmcs_checkl(field);
+ __vmcs_writel(field, value);
}
-static void vmcs_set_bits(unsigned long field, u32 mask)
+static __always_inline void vmcs_clear_bits(unsigned long field, u32 mask)
{
- vmcs_writel(field, vmcs_readl(field) | mask);
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_clear_bits does not support 64-bit fields");
+ __vmcs_writel(field, __vmcs_readl(field) & ~mask);
+}
+
+static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
+{
+ BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x2000,
+ "vmcs_set_bits does not support 64-bit fields");
+ __vmcs_writel(field, __vmcs_readl(field) | mask);
}
static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
@@ -2498,7 +2563,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
- if (vmx_cpu_uses_apicv(&vmx->vcpu))
+ if (kvm_vcpu_apicv_active(&vmx->vcpu))
vmx->nested.nested_vmx_pinbased_ctls_high |=
PIN_BASED_POSTED_INTR;
@@ -4186,7 +4251,7 @@ out:
static int init_rmode_identity_map(struct kvm *kvm)
{
int i, idx, r = 0;
- pfn_t identity_map_pfn;
+ kvm_pfn_t identity_map_pfn;
u32 tmp;
if (!enable_ept)
@@ -4462,9 +4527,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
msr, MSR_TYPE_W);
}
-static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
+static bool vmx_get_enable_apicv(void)
{
- return enable_apicv && lapic_in_kernel(vcpu);
+ return enable_apicv;
}
static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4586,11 +4651,6 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
}
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
/*
* Set up the vmcs's constant host-state fields, i.e., host-state fields that
* will not change in the lifetime of the guest.
@@ -4660,11 +4720,18 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
- if (!vmx_cpu_uses_apicv(&vmx->vcpu))
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
return pin_based_exec_ctrl;
}
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+}
+
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4703,7 +4770,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
if (!ple_gap)
exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
- if (!vmx_cpu_uses_apicv(&vmx->vcpu))
+ if (!kvm_vcpu_apicv_active(&vmx->vcpu))
exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4767,7 +4834,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
vmx_secondary_exec_control(vmx));
- if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
+ if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4775,7 +4842,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write16(GUEST_INTR_STATUS, 0);
- vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
}
@@ -4867,7 +4934,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
seg_setup(VCPU_SREG_CS);
vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
- vmcs_write32(GUEST_CS_BASE, 0xffff0000);
+ vmcs_writel(GUEST_CS_BASE, 0xffff0000ul);
seg_setup(VCPU_SREG_DS);
seg_setup(VCPU_SREG_ES);
@@ -4903,7 +4970,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
- vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
setup_msrs(vmx);
@@ -4919,7 +4986,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx_cpu_uses_apicv(vcpu))
+ if (kvm_vcpu_apicv_active(vcpu))
memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
if (vmx->vpid != 0)
@@ -6203,15 +6270,6 @@ static __init int hardware_setup(void)
kvm_tsc_scaling_ratio_frac_bits = 48;
}
- if (enable_apicv)
- kvm_x86_ops->update_cr8_intercept = NULL;
- else {
- kvm_x86_ops->hwapic_irr_update = NULL;
- kvm_x86_ops->hwapic_isr_update = NULL;
- kvm_x86_ops->deliver_posted_interrupt = NULL;
- kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
- }
-
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7901,7 +7959,7 @@ static void dump_vmcs(void)
u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
u32 secondary_exec_control = 0;
unsigned long cr4 = vmcs_readl(GUEST_CR4);
- u64 efer = vmcs_readl(GUEST_IA32_EFER);
+ u64 efer = vmcs_read64(GUEST_IA32_EFER);
int i, n;
if (cpu_has_secondary_exec_ctrls())
@@ -7917,10 +7975,10 @@ static void dump_vmcs(void)
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
(cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
{
- pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
- vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
- pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
- vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+ pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
+ vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
}
pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
@@ -7941,16 +7999,16 @@ static void dump_vmcs(void)
vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
(vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
- pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
- efer, vmcs_readl(GUEST_IA32_PAT));
- pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
- vmcs_readl(GUEST_IA32_DEBUGCTL),
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ efer, vmcs_read64(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
+ vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
- pr_err("PerfGlobCtl = 0x%016lx\n",
- vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
- pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+ pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
pr_err("Interruptibility = %08x ActivityState = %08x\n",
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
vmcs_read32(GUEST_ACTIVITY_STATE));
@@ -7979,11 +8037,12 @@ static void dump_vmcs(void)
vmcs_read32(HOST_IA32_SYSENTER_CS),
vmcs_readl(HOST_IA32_SYSENTER_EIP));
if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
- pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
- vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+ pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_EFER),
+ vmcs_read64(HOST_IA32_PAT));
if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- pr_err("PerfGlobCtl = 0x%016lx\n",
- vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+ pr_err("PerfGlobCtl = 0x%016llx\n",
+ vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
pr_err("*** Control State ***\n");
pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
@@ -8006,16 +8065,16 @@ static void dump_vmcs(void)
pr_err("IDTVectoring: info=%08x errcode=%08x\n",
vmcs_read32(IDT_VECTORING_INFO_FIELD),
vmcs_read32(IDT_VECTORING_ERROR_CODE));
- pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
- pr_err("TSC Multiplier = 0x%016lx\n",
- vmcs_readl(TSC_MULTIPLIER));
+ pr_err("TSC Multiplier = 0x%016llx\n",
+ vmcs_read64(TSC_MULTIPLIER));
if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
- pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+ pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
n = vmcs_read32(CR3_TARGET_COUNT);
for (i = 0; i + 1 < n; i += 4)
pr_err("CR3 target%u=%016lx target%u=%016lx\n",
@@ -8154,7 +8213,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
* apicv
*/
if (!cpu_has_vmx_virtualize_x2apic_mode() ||
- !vmx_cpu_uses_apicv(vcpu))
+ !kvm_vcpu_apicv_active(vcpu))
return;
if (!cpu_need_tpr_shadow(vcpu))
@@ -8259,10 +8318,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
}
}
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
- u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
- if (!vmx_cpu_uses_apicv(vcpu))
+ if (!kvm_vcpu_apicv_active(vcpu))
return;
vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8932,7 +8990,8 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
best->ebx &= ~bit(X86_FEATURE_INVPCID);
}
- vmcs_set_secondary_exec_control(secondary_exec_ctl);
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(secondary_exec_ctl);
if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
if (guest_cpuid_has_pcommit(vcpu))
@@ -9508,7 +9567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
*/
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
- vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+ vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
vmcs_write64(POSTED_INTR_DESC_ADDR,
page_to_phys(vmx->nested.pi_desc_page) +
(unsigned long)(vmcs12->posted_intr_desc_addr &
@@ -10169,7 +10228,7 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Additionally, restore L2's PDPTR to vmcs12.
*/
if (enable_ept) {
- vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+ vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
@@ -10805,7 +10864,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .cpu_uses_apicv = vmx_cpu_uses_apicv,
+ .get_enable_apicv = vmx_get_enable_apicv,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 97592e190413..4244c2baf57d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -951,7 +951,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
+ MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
};
static unsigned num_msrs_to_save;
@@ -966,6 +966,8 @@ static u32 emulated_msrs[] = {
HV_X64_MSR_RESET,
HV_X64_MSR_VP_INDEX,
HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
MSR_KVM_PV_EOI_EN,
@@ -1167,7 +1169,8 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
++version;
- kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+ if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
+ return;
/*
* The guest calculates current wall clock time by adding
@@ -1683,6 +1686,11 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
#endif
}
+void kvm_make_mclock_inprogress_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
+}
+
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
@@ -2198,6 +2206,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
@@ -2402,6 +2411,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
+ case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data);
break;
@@ -2541,6 +2551,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV:
case KVM_CAP_HYPERV_VAPIC:
case KVM_CAP_HYPERV_SPIN:
+ case KVM_CAP_HYPERV_SYNIC:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2693,6 +2704,11 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
return kvm_arch_has_noncoherent_dma(vcpu->kvm);
}
+static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests);
+}
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
/* Address WBINVD may be executed by guest */
@@ -2748,7 +2764,9 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+
memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
return 0;
@@ -3191,6 +3209,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+ struct kvm_enable_cap *cap)
+{
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_HYPERV_SYNIC:
+ return kvm_hv_activate_synic(vcpu);
+ default:
+ return -EINVAL;
+ }
+}
+
long kvm_arch_vcpu_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3455,6 +3487,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_set_guest_paused(vcpu);
goto out;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+ break;
+ }
default:
r = -EINVAL;
}
@@ -4006,16 +4047,17 @@ static void kvm_init_msr_list(void)
/*
* Even MSRs that are valid in the host may not be exposed
- * to the guests in some cases. We could work around this
- * in VMX with the generic MSR save/load machinery, but it
- * is not really worthwhile since it will really only
- * happen with nested virtualization.
+ * to the guests in some cases.
*/
switch (msrs_to_save[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_x86_ops->mpx_supported())
continue;
break;
+ case MSR_TSC_AUX:
+ if (!kvm_x86_ops->rdtscp_supported())
+ continue;
+ break;
default:
break;
}
@@ -5106,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
int emulation_type)
{
gpa_t gpa = cr2;
- pfn_t pfn;
+ kvm_pfn_t pfn;
if (emulation_type & EMULTYPE_NO_REEXECUTE)
return false;
@@ -5872,6 +5914,12 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.apicv_active = false;
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -5965,6 +6013,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
if (!vcpu->arch.apic)
return;
+ if (vcpu->arch.apicv_active)
+ return;
+
if (!vcpu->arch.apic->vapic_addr)
max_irr = kvm_lapic_find_highest_irr(vcpu);
else
@@ -6301,20 +6352,30 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
+{
+ kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
+}
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
+ u64 eoi_exit_bitmap[4];
+
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
return;
- memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
+ bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
if (irqchip_split(vcpu->kvm))
- kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
+ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- kvm_x86_ops->sync_pir_to_irr(vcpu);
- kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+ if (vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
- kvm_x86_ops->load_eoi_exitmap(vcpu);
+ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
+ vcpu_to_synic(vcpu)->vec_bitmap, 256);
+ kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6422,7 +6483,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
if (test_bit(vcpu->arch.pending_ioapic_eoi,
- (void *) vcpu->arch.eoi_exit_bitmap)) {
+ vcpu->arch.ioapic_handled_vectors)) {
vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
vcpu->run->eoi.vector =
vcpu->arch.pending_ioapic_eoi;
@@ -6446,6 +6507,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
+ if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+ r = 0;
+ goto out;
+ }
+
+ /*
+ * KVM_REQ_HV_STIMER has to be processed after
+ * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+ * depend on the guest clock being up-to-date
+ */
+ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+ kvm_hv_process_stimers(vcpu);
}
/*
@@ -6457,7 +6532,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* Update architecture specific hints for APIC
* virtual interrupt delivery.
*/
- if (kvm_x86_ops->hwapic_irr_update)
+ if (vcpu->arch.apicv_active)
kvm_x86_ops->hwapic_irr_update(vcpu,
kvm_lapic_find_highest_irr(vcpu));
}
@@ -7528,6 +7603,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
BUG_ON(vcpu->kvm == NULL);
kvm = vcpu->kvm;
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -7585,6 +7661,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
+ kvm_hv_vcpu_init(vcpu);
+
return 0;
fail_free_mce_banks:
@@ -7603,6 +7681,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
int idx;
+ kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
kvm_free_lapic(vcpu);
@@ -7997,6 +8076,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_cpu_has_interrupt(vcpu))
return true;
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
return false;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index ae9a37bf1371..6d5eb5900372 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -9,6 +9,7 @@
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
+#include <linux/memremap.h>
#include <asm/pgtable.h>
@@ -63,6 +64,16 @@ retry:
#endif
}
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ retry:
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
unsigned long mask;
+ int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 0;
}
- if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ page = pte_page(pte);
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ pte_unmap(ptep);
+ return 0;
+ }
+ } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- page = pte_page(pte);
get_page(page);
+ put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
SetPageReferenced(page);
}
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ unsigned long pfn = pmd_pfn(pmd);
+ struct dev_pagemap *pgmap = NULL;
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
return 0;
+
+ VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
+ if (pmd_devmap(pmd))
+ return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
+
/* hugepages are never "special" */
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
- VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
refs = 0;
head = pmd_page(pmd);
@@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- /*
- * The pmd_trans_splitting() check below explains why
- * pmdp_splitting_flush has to flush the tlb, to stop
- * this gup-fast code from running while we set the
- * splitting bit in the pmd. Returning zero will take
- * the slow path that will call wait_split_huge_page()
- * if the pmd is still in splitting state. gup-fast
- * can't because it has irq disabled and
- * wait_split_huge_page() would never return as the
- * tlb flush IPI wouldn't run.
- */
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
@@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
- if (PageTail(page))
- get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..5488d21123bd 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/memremap.h>
#include <linux/nmi.h>
#include <linux/gfp.h>
#include <linux/kcore.h>
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order)
{
unsigned long magic;
unsigned int nr_pages = 1 << order;
+ struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);
+
+ if (altmap) {
+ vmem_altmap_free(altmap, nr_pages);
+ return;
+ }
/* bootmem page has reserved flag */
if (PageReserved(page)) {
@@ -814,8 +821,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (IS_ALIGNED(addr, PAGE_SIZE) &&
- IS_ALIGNED(next, PAGE_SIZE)) {
+ if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
/*
* Do not free direct mapping pages since they were
* freed when offlining, or simplely not in use.
@@ -1018,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+ struct page *page = pfn_to_page(start_pfn);
+ struct vmem_altmap *altmap;
struct zone *zone;
int ret;
- zone = page_zone(pfn_to_page(start_pfn));
- kernel_physical_mapping_remove(start, start + size);
+ /* With altmap the first mapped page is offset from @start */
+ altmap = to_vmem_altmap((unsigned long) page);
+ if (altmap)
+ page += vmem_altmap_offset(altmap);
+ zone = page_zone(page);
ret = __remove_pages(zone, start_pfn, nr_pages);
WARN_ON_ONCE(ret);
+ kernel_physical_mapping_remove(start, start + size);
return ret;
}
@@ -1236,7 +1248,7 @@ static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;
static int __meminit vmemmap_populate_hugepages(unsigned long start,
- unsigned long end, int node)
+ unsigned long end, int node, struct vmem_altmap *altmap)
{
unsigned long addr;
unsigned long next;
@@ -1259,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (pmd_none(*pmd)) {
void *p;
- p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+ p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
pte_t entry;
@@ -1280,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
continue;
- }
+ } else if (altmap)
+ return -ENOMEM; /* no fallback */
} else if (pmd_large(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
@@ -1294,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
+ struct vmem_altmap *altmap = to_vmem_altmap(start);
int err;
if (cpu_has_pse)
- err = vmemmap_populate_hugepages(start, end, node);
- else
+ err = vmemmap_populate_hugepages(start, end, node, altmap);
+ else if (altmap) {
+ pr_err_once("%s: no cpu support for altmap allocations\n",
+ __func__);
+ err = -ENOMEM;
+ } else
err = vmemmap_populate_basepages(start, end, node);
if (!err)
sync_global_pgds(start, end - 1, 0);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 844b06d67df4..96bd1e2bffaf 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void)
{
unsigned long rnd;
- /*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
if (mmap_is_ia32())
- rnd = (unsigned long)get_random_int() % (1<<8);
+#ifdef CONFIG_COMPAT
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1);
+#else
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
+#endif
else
- rnd = (unsigned long)get_random_int() % (1<<28);
+ rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6000ad7f560c..fc6a4c8f6e2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
static void split_page_count(int level)
{
+ if (direct_pages_count[level] == 0)
+ return;
+
direct_pages_count[level]--;
direct_pages_count[level - 1] += PTRS_PER_PTE;
}
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 031782e74231..f4ae536b0914 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,6 +12,7 @@
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
}
int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
- unsigned long pfn)
+ pfn_t pfn)
{
enum page_cache_mode pcm;
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
return 0;
/* Set prot based on lookup */
- pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
+ pcm = lookup_memtype(pfn_t_to_phys(pfn));
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
cachemode2protval(pcm));
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ee9c2e3a7199..4eb287e25043 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
return young;
}
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp)
-{
- int set;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
- (unsigned long *)pmdp);
- if (set) {
- /* need tlb flush only to serialize against gup-fast */
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
-}
#endif
/**
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
preempt_disable();
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+ /* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
+
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
- if (current->active_mm != mm)
+ if (current->active_mm != mm) {
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
+ }
if (!current->mm) {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+
goto out;
}
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+ /*
+ * Both branches below are implicit full barriers (MOV to CR or
+ * INVLPG) that synchronize with switch_mm.
+ */
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
preempt_disable();
if (current->active_mm == mm) {
- if (current->mm)
+ if (current->mm) {
+ /*
+ * Implicit full barrier (INVLPG) that synchronizes
+ * with switch_mm.
+ */
__flush_tlb_one(start);
- else
+ } else {
leave_mm(smp_processor_id());
+
+ /* Synchronize with switch_mm. */
+ smp_mb();
+ }
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 75991979f667..4286f3618bd0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -193,7 +193,7 @@ struct jit_context {
32 /* space for rbx, r13, r14, r15 */ + \
8 /* space for skb_copy_bits() buffer */)
-#define PROLOGUE_SIZE 51
+#define PROLOGUE_SIZE 48
/* emit x64 prologue code for BPF program and check it's size.
* bpf_tail_call helper will skip it while jumping into another program
@@ -229,11 +229,15 @@ static void emit_prologue(u8 **pprog)
/* mov qword ptr [rbp-X],r15 */
EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
- /* clear A and X registers */
- EMIT2(0x31, 0xc0); /* xor eax, eax */
- EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
+ /* Clear the tail call counter (tail_call_cnt): for eBPF tail calls
+ * we need to reset the counter to 0. It's done in two instructions,
+ * resetting rax register to 0 (xor on eax gets 0 extended), and
+ * moving it to the counter location.
+ */
- /* clear tail_cnt: mov qword ptr [rbp-X], rax */
+ /* xor eax, eax */
+ EMIT2(0x31, 0xc0);
+ /* mov qword ptr [rbp-X], rax */
EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
@@ -455,6 +459,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
}
case BPF_ALU | BPF_MOV | BPF_K:
+ /* optimization: if imm32 is zero, use 'xor <dst>,<dst>'
+ * to save 3 bytes.
+ */
+ if (imm32 == 0) {
+ if (is_ereg(dst_reg))
+ EMIT1(add_2mod(0x40, dst_reg, dst_reg));
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT2(b2, add_2reg(b3, dst_reg, dst_reg));
+ break;
+ }
+
/* mov %eax, imm32 */
if (is_ereg(dst_reg))
EMIT1(add_1mod(0x40, dst_reg));
@@ -469,6 +485,20 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
return -EINVAL;
}
+ /* optimization: if imm64 is zero, use 'xor <dst>,<dst>'
+ * to save 7 bytes.
+ */
+ if (insn[0].imm == 0 && insn[1].imm == 0) {
+ b1 = add_2mod(0x48, dst_reg, dst_reg);
+ b2 = 0x31; /* xor */
+ b3 = 0xC0;
+ EMIT3(b1, b2, add_2reg(b3, dst_reg, dst_reg));
+
+ insn++;
+ i++;
+ break;
+ }
+
/* movabsq %rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
EMIT(insn[0].imm, 4);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 5c6fc3577a49..97062a635b77 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -23,6 +23,8 @@ obj-y += bus_numa.o
obj-$(CONFIG_AMD_NB) += amd_bus.o
obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o
+obj-$(CONFIG_VMD) += vmd.o
+
ifeq ($(CONFIG_PCI_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index eccd4d99e6a4..2879efc73a96 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -641,6 +641,43 @@ unsigned int pcibios_assign_all_busses(void)
return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
}
+#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
+static LIST_HEAD(dma_domain_list);
+static DEFINE_SPINLOCK(dma_domain_list_lock);
+
+void add_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_add(&domain->node, &dma_domain_list);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(add_dma_domain);
+
+void del_dma_domain(struct dma_domain *domain)
+{
+ spin_lock(&dma_domain_list_lock);
+ list_del(&domain->node);
+ spin_unlock(&dma_domain_list_lock);
+}
+EXPORT_SYMBOL_GPL(del_dma_domain);
+
+static void set_dma_domain_ops(struct pci_dev *pdev)
+{
+ struct dma_domain *domain;
+
+ spin_lock(&dma_domain_list_lock);
+ list_for_each_entry(domain, &dma_domain_list, node) {
+ if (pci_domain_nr(pdev->bus) == domain->domain_nr) {
+ pdev->dev.archdata.dma_ops = domain->dma_ops;
+ break;
+ }
+ }
+ spin_unlock(&dma_domain_list_lock);
+}
+#else
+static void set_dma_domain_ops(struct pci_dev *pdev) {}
+#endif
+
int pcibios_add_device(struct pci_dev *dev)
{
struct setup_data *data;
@@ -670,6 +707,7 @@ int pcibios_add_device(struct pci_dev *dev)
pa_data = data->next;
iounmap(data);
}
+ set_dma_domain_ops(dev);
return 0;
}
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 9b83b9051ae7..9770e55e768f 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -180,6 +180,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
unsigned long result = 0;
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0, mask = 0;
WARN_ON(seg);
if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
@@ -189,53 +190,35 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
switch (len) {
case 1:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_BYTE),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
- /*
- * Zero-extend the result beyond 8 bits, do not trust the
- * BIOS having done it:
- */
- *value &= 0xff;
+ number = PCIBIOS_READ_CONFIG_BYTE;
+ mask = 0xff;
break;
case 2:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_WORD),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
- /*
- * Zero-extend the result beyond 16 bits, do not trust the
- * BIOS having done it:
- */
- *value &= 0xffff;
+ number = PCIBIOS_READ_CONFIG_WORD;
+ mask = 0xffff;
break;
case 4:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=c" (*value),
- "=a" (result)
- : "1" (PCIBIOS_READ_CONFIG_DWORD),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_READ_CONFIG_DWORD;
break;
}
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=c" (*value),
+ "=a" (result)
+ : "1" (number),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+ /*
+ * Zero-extend the result beyond 8 or 16 bits, do not trust the
+ * BIOS having done it:
+ */
+ if (mask)
+ *value &= mask;
+
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
return (int)((result & 0xff00) >> 8);
@@ -247,6 +230,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
unsigned long result = 0;
unsigned long flags;
unsigned long bx = (bus << 8) | devfn;
+ u16 number = 0;
WARN_ON(seg);
if ((bus > 255) || (devfn > 255) || (reg > 255))
@@ -256,43 +240,27 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
switch (len) {
case 1:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_BYTE),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_BYTE;
break;
case 2:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_WORD),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_WORD;
break;
case 4:
- __asm__("lcall *(%%esi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=a" (result)
- : "0" (PCIBIOS_WRITE_CONFIG_DWORD),
- "c" (value),
- "b" (bx),
- "D" ((long)reg),
- "S" (&pci_indirect));
+ number = PCIBIOS_WRITE_CONFIG_DWORD;
break;
}
+ __asm__("lcall *(%%esi); cld\n\t"
+ "jc 1f\n\t"
+ "xor %%ah, %%ah\n"
+ "1:"
+ : "=a" (result)
+ : "0" (number),
+ "c" (value),
+ "b" (bx),
+ "D" ((long)reg),
+ "S" (&pci_indirect));
+
raw_spin_unlock_irqrestore(&pci_config_lock, flags);
return (int)((result & 0xff00) >> 8);
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
new file mode 100644
index 000000000000..d57e48016f15
--- /dev/null
+++ b/arch/x86/pci/vmd.c
@@ -0,0 +1,723 @@
+/*
+ * Volume Management Device driver
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/pci.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+
+#include <asm/irqdomain.h>
+#include <asm/device.h>
+#include <asm/msi.h>
+#include <asm/msidef.h>
+
+#define VMD_CFGBAR 0
+#define VMD_MEMBAR1 2
+#define VMD_MEMBAR2 4
+
+/*
+ * Lock for manipulating VMD IRQ lists.
+ */
+static DEFINE_RAW_SPINLOCK(list_lock);
+
+/**
+ * struct vmd_irq - private data to map driver IRQ to the VMD shared vector
+ * @node: list item for parent traversal.
+ * @rcu: RCU callback item for freeing.
+ * @irq: back pointer to parent.
+ * @virq: the virtual IRQ value provided to the requesting driver.
+ *
+ * Every MSI/MSI-X IRQ requested for a device in a VMD domain will be mapped to
+ * a VMD IRQ using this structure.
+ */
+struct vmd_irq {
+ struct list_head node;
+ struct rcu_head rcu;
+ struct vmd_irq_list *irq;
+ unsigned int virq;
+};
+
+/**
+ * struct vmd_irq_list - list of driver requested IRQs mapping to a VMD vector
+ * @irq_list: the list of irq's the VMD one demuxes to.
+ * @vmd_vector: the h/w IRQ assigned to the VMD.
+ * @index: index into the VMD MSI-X table; used for message routing.
+ * @count: number of child IRQs assigned to this vector; used to track
+ * sharing.
+ */
+struct vmd_irq_list {
+ struct list_head irq_list;
+ struct vmd_dev *vmd;
+ unsigned int vmd_vector;
+ unsigned int index;
+ unsigned int count;
+};
+
+struct vmd_dev {
+ struct pci_dev *dev;
+
+ spinlock_t cfg_lock;
+ char __iomem *cfgbar;
+
+ int msix_count;
+ struct msix_entry *msix_entries;
+ struct vmd_irq_list *irqs;
+
+ struct pci_sysdata sysdata;
+ struct resource resources[3];
+ struct irq_domain *irq_domain;
+ struct pci_bus *bus;
+
+#ifdef CONFIG_X86_DEV_DMA_OPS
+ struct dma_map_ops dma_ops;
+ struct dma_domain dma_domain;
+#endif
+};
+
+static inline struct vmd_dev *vmd_from_bus(struct pci_bus *bus)
+{
+ return container_of(bus->sysdata, struct vmd_dev, sysdata);
+}
+
+/*
+ * Drivers managing a device in a VMD domain allocate their own IRQs as before,
+ * but the MSI entry for the hardware it's driving will be programmed with a
+ * destination ID for the VMD MSI-X table. The VMD muxes interrupts in its
+ * domain into one of its own, and the VMD driver de-muxes these for the
+ * handlers sharing that VMD IRQ. The vmd irq_domain provides the operations
+ * and irq_chip to set this up.
+ */
+static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+ struct vmd_irq_list *irq = vmdirq->irq;
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+ msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_DEST_ID(irq->index);
+ msg->data = 0;
+}
+
+/*
+ * We rely on MSI_FLAG_USE_DEF_CHIP_OPS to set the IRQ mask/unmask ops.
+ */
+static void vmd_irq_enable(struct irq_data *data)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+
+ raw_spin_lock(&list_lock);
+ list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
+ raw_spin_unlock(&list_lock);
+
+ data->chip->irq_unmask(data);
+}
+
+static void vmd_irq_disable(struct irq_data *data)
+{
+ struct vmd_irq *vmdirq = data->chip_data;
+
+ data->chip->irq_mask(data);
+
+ raw_spin_lock(&list_lock);
+ list_del_rcu(&vmdirq->node);
+ raw_spin_unlock(&list_lock);
+}
+
+/*
+ * XXX: Stubbed until we develop acceptable way to not create conflicts with
+ * other devices sharing the same vector.
+ */
+static int vmd_irq_set_affinity(struct irq_data *data,
+ const struct cpumask *dest, bool force)
+{
+ return -EINVAL;
+}
+
+static struct irq_chip vmd_msi_controller = {
+ .name = "VMD-MSI",
+ .irq_enable = vmd_irq_enable,
+ .irq_disable = vmd_irq_disable,
+ .irq_compose_msi_msg = vmd_compose_msi_msg,
+ .irq_set_affinity = vmd_irq_set_affinity,
+};
+
+static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
+ msi_alloc_info_t *arg)
+{
+ return 0;
+}
+
+/*
+ * XXX: We can be even smarter selecting the best IRQ once we solve the
+ * affinity problem.
+ */
+static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd)
+{
+ int i, best = 0;
+
+ raw_spin_lock(&list_lock);
+ for (i = 1; i < vmd->msix_count; i++)
+ if (vmd->irqs[i].count < vmd->irqs[best].count)
+ best = i;
+ vmd->irqs[best].count++;
+ raw_spin_unlock(&list_lock);
+
+ return &vmd->irqs[best];
+}
+
+static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
+ unsigned int virq, irq_hw_number_t hwirq,
+ msi_alloc_info_t *arg)
+{
+ struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus);
+ struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
+
+ if (!vmdirq)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&vmdirq->node);
+ vmdirq->irq = vmd_next_irq(vmd);
+ vmdirq->virq = virq;
+
+ irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
+ vmdirq, handle_simple_irq, vmd, NULL);
+ return 0;
+}
+
+static void vmd_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ struct vmd_irq *vmdirq = irq_get_chip_data(virq);
+
+ /* XXX: Potential optimization to rebalance */
+ raw_spin_lock(&list_lock);
+ vmdirq->irq->count--;
+ raw_spin_unlock(&list_lock);
+
+ kfree_rcu(vmdirq, rcu);
+}
+
+static int vmd_msi_prepare(struct irq_domain *domain, struct device *dev,
+ int nvec, msi_alloc_info_t *arg)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
+
+ if (nvec > vmd->msix_count)
+ return vmd->msix_count;
+
+ memset(arg, 0, sizeof(*arg));
+ return 0;
+}
+
+static void vmd_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+ arg->desc = desc;
+}
+
+static struct msi_domain_ops vmd_msi_domain_ops = {
+ .get_hwirq = vmd_get_hwirq,
+ .msi_init = vmd_msi_init,
+ .msi_free = vmd_msi_free,
+ .msi_prepare = vmd_msi_prepare,
+ .set_desc = vmd_set_desc,
+};
+
+static struct msi_domain_info vmd_msi_domain_info = {
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_PCI_MSIX,
+ .ops = &vmd_msi_domain_ops,
+ .chip = &vmd_msi_controller,
+};
+
+#ifdef CONFIG_X86_DEV_DMA_OPS
+/*
+ * VMD replaces the requester ID with its own. DMA mappings for devices in a
+ * VMD domain need to be mapped for the VMD, not the device requiring
+ * the mapping.
+ */
+static struct device *to_vmd_dev(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct vmd_dev *vmd = vmd_from_bus(pdev->bus);
+
+ return &vmd->dev->dev;
+}
+
+static struct dma_map_ops *vmd_dma_ops(struct device *dev)
+{
+ return to_vmd_dev(dev)->archdata.dma_ops;
+}
+
+static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
+ gfp_t flag, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->alloc(to_vmd_dev(dev), size, addr, flag,
+ attrs);
+}
+
+static void vmd_free(struct device *dev, size_t size, void *vaddr,
+ dma_addr_t addr, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->free(to_vmd_dev(dev), size, vaddr, addr,
+ attrs);
+}
+
+static int vmd_mmap(struct device *dev, struct vm_area_struct *vma,
+ void *cpu_addr, dma_addr_t addr, size_t size,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->mmap(to_vmd_dev(dev), vma, cpu_addr, addr,
+ size, attrs);
+}
+
+static int vmd_get_sgtable(struct device *dev, struct sg_table *sgt,
+ void *cpu_addr, dma_addr_t addr, size_t size,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->get_sgtable(to_vmd_dev(dev), sgt, cpu_addr,
+ addr, size, attrs);
+}
+
+static dma_addr_t vmd_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->map_page(to_vmd_dev(dev), page, offset, size,
+ dir, attrs);
+}
+
+static void vmd_unmap_page(struct device *dev, dma_addr_t addr, size_t size,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ vmd_dma_ops(dev)->unmap_page(to_vmd_dev(dev), addr, size, dir, attrs);
+}
+
+static int vmd_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ return vmd_dma_ops(dev)->map_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
+}
+
+static void vmd_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+ vmd_dma_ops(dev)->unmap_sg(to_vmd_dev(dev), sg, nents, dir, attrs);
+}
+
+static void vmd_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_single_for_cpu(to_vmd_dev(dev), addr, size, dir);
+}
+
+static void vmd_sync_single_for_device(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_single_for_device(to_vmd_dev(dev), addr, size,
+ dir);
+}
+
+static void vmd_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_sg_for_cpu(to_vmd_dev(dev), sg, nents, dir);
+}
+
+static void vmd_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction dir)
+{
+ vmd_dma_ops(dev)->sync_sg_for_device(to_vmd_dev(dev), sg, nents, dir);
+}
+
+static int vmd_mapping_error(struct device *dev, dma_addr_t addr)
+{
+ return vmd_dma_ops(dev)->mapping_error(to_vmd_dev(dev), addr);
+}
+
+static int vmd_dma_supported(struct device *dev, u64 mask)
+{
+ return vmd_dma_ops(dev)->dma_supported(to_vmd_dev(dev), mask);
+}
+
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+static u64 vmd_get_required_mask(struct device *dev)
+{
+ return vmd_dma_ops(dev)->get_required_mask(to_vmd_dev(dev));
+}
+#endif
+
+static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
+{
+ struct dma_domain *domain = &vmd->dma_domain;
+
+ if (vmd->dev->dev.archdata.dma_ops)
+ del_dma_domain(domain);
+}
+
+#define ASSIGN_VMD_DMA_OPS(source, dest, fn) \
+ do { \
+ if (source->fn) \
+ dest->fn = vmd_##fn; \
+ } while (0)
+
+static void vmd_setup_dma_ops(struct vmd_dev *vmd)
+{
+ const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops;
+ struct dma_map_ops *dest = &vmd->dma_ops;
+ struct dma_domain *domain = &vmd->dma_domain;
+
+ domain->domain_nr = vmd->sysdata.domain;
+ domain->dma_ops = dest;
+
+ if (!source)
+ return;
+ ASSIGN_VMD_DMA_OPS(source, dest, alloc);
+ ASSIGN_VMD_DMA_OPS(source, dest, free);
+ ASSIGN_VMD_DMA_OPS(source, dest, mmap);
+ ASSIGN_VMD_DMA_OPS(source, dest, get_sgtable);
+ ASSIGN_VMD_DMA_OPS(source, dest, map_page);
+ ASSIGN_VMD_DMA_OPS(source, dest, unmap_page);
+ ASSIGN_VMD_DMA_OPS(source, dest, map_sg);
+ ASSIGN_VMD_DMA_OPS(source, dest, unmap_sg);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_cpu);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_single_for_device);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_cpu);
+ ASSIGN_VMD_DMA_OPS(source, dest, sync_sg_for_device);
+ ASSIGN_VMD_DMA_OPS(source, dest, mapping_error);
+ ASSIGN_VMD_DMA_OPS(source, dest, dma_supported);
+#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK
+ ASSIGN_VMD_DMA_OPS(source, dest, get_required_mask);
+#endif
+ add_dma_domain(domain);
+}
+#undef ASSIGN_VMD_DMA_OPS
+#else
+static void vmd_teardown_dma_ops(struct vmd_dev *vmd) {}
+static void vmd_setup_dma_ops(struct vmd_dev *vmd) {}
+#endif
+
+static char __iomem *vmd_cfg_addr(struct vmd_dev *vmd, struct pci_bus *bus,
+ unsigned int devfn, int reg, int len)
+{
+ char __iomem *addr = vmd->cfgbar +
+ (bus->number << 20) + (devfn << 12) + reg;
+
+ if ((addr - vmd->cfgbar) + len >=
+ resource_size(&vmd->dev->resource[VMD_CFGBAR]))
+ return NULL;
+
+ return addr;
+}
+
+/*
+ * CPU may deadlock if config space is not serialized on some versions of this
+ * hardware, so all config space access is done under a spinlock.
+ */
+static int vmd_pci_read(struct pci_bus *bus, unsigned int devfn, int reg,
+ int len, u32 *value)
+{
+ struct vmd_dev *vmd = vmd_from_bus(bus);
+ char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
+ unsigned long flags;
+ int ret = 0;
+
+ if (!addr)
+ return -EFAULT;
+
+ spin_lock_irqsave(&vmd->cfg_lock, flags);
+ switch (len) {
+ case 1:
+ *value = readb(addr);
+ break;
+ case 2:
+ *value = readw(addr);
+ break;
+ case 4:
+ *value = readl(addr);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ return ret;
+}
+
+/*
+ * VMD h/w converts non-posted config writes to posted memory writes. The
+ * read-back in this function forces the completion so it returns only after
+ * the config space was written, as expected.
+ */
+static int vmd_pci_write(struct pci_bus *bus, unsigned int devfn, int reg,
+ int len, u32 value)
+{
+ struct vmd_dev *vmd = vmd_from_bus(bus);
+ char __iomem *addr = vmd_cfg_addr(vmd, bus, devfn, reg, len);
+ unsigned long flags;
+ int ret = 0;
+
+ if (!addr)
+ return -EFAULT;
+
+ spin_lock_irqsave(&vmd->cfg_lock, flags);
+ switch (len) {
+ case 1:
+ writeb(value, addr);
+ readb(addr);
+ break;
+ case 2:
+ writew(value, addr);
+ readw(addr);
+ break;
+ case 4:
+ writel(value, addr);
+ readl(addr);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&vmd->cfg_lock, flags);
+ return ret;
+}
+
+static struct pci_ops vmd_ops = {
+ .read = vmd_pci_read,
+ .write = vmd_pci_write,
+};
+
+/*
+ * VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
+ */
+static int vmd_find_free_domain(void)
+{
+ int domain = 0xffff;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus)) != NULL)
+ domain = max_t(int, domain, pci_domain_nr(bus));
+ return domain + 1;
+}
+
+static int vmd_enable_domain(struct vmd_dev *vmd)
+{
+ struct pci_sysdata *sd = &vmd->sysdata;
+ struct resource *res;
+ u32 upper_bits;
+ unsigned long flags;
+ LIST_HEAD(resources);
+
+ res = &vmd->dev->resource[VMD_CFGBAR];
+ vmd->resources[0] = (struct resource) {
+ .name = "VMD CFGBAR",
+ .start = res->start,
+ .end = (resource_size(res) >> 20) - 1,
+ .flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
+ };
+
+ res = &vmd->dev->resource[VMD_MEMBAR1];
+ upper_bits = upper_32_bits(res->end);
+ flags = res->flags & ~IORESOURCE_SIZEALIGN;
+ if (!upper_bits)
+ flags &= ~IORESOURCE_MEM_64;
+ vmd->resources[1] = (struct resource) {
+ .name = "VMD MEMBAR1",
+ .start = res->start,
+ .end = res->end,
+ .flags = flags,
+ };
+
+ res = &vmd->dev->resource[VMD_MEMBAR2];
+ upper_bits = upper_32_bits(res->end);
+ flags = res->flags & ~IORESOURCE_SIZEALIGN;
+ if (!upper_bits)
+ flags &= ~IORESOURCE_MEM_64;
+ vmd->resources[2] = (struct resource) {
+ .name = "VMD MEMBAR2",
+ .start = res->start + 0x2000,
+ .end = res->end,
+ .flags = flags,
+ };
+
+ sd->domain = vmd_find_free_domain();
+ if (sd->domain < 0)
+ return sd->domain;
+
+ sd->node = pcibus_to_node(vmd->dev->bus);
+
+ vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
+ NULL);
+ if (!vmd->irq_domain)
+ return -ENODEV;
+
+ pci_add_resource(&resources, &vmd->resources[0]);
+ pci_add_resource(&resources, &vmd->resources[1]);
+ pci_add_resource(&resources, &vmd->resources[2]);
+ vmd->bus = pci_create_root_bus(&vmd->dev->dev, 0, &vmd_ops, sd,
+ &resources);
+ if (!vmd->bus) {
+ pci_free_resource_list(&resources);
+ irq_domain_remove(vmd->irq_domain);
+ return -ENODEV;
+ }
+
+ vmd_setup_dma_ops(vmd);
+ dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
+ pci_rescan_bus(vmd->bus);
+
+ WARN(sysfs_create_link(&vmd->dev->dev.kobj, &vmd->bus->dev.kobj,
+ "domain"), "Can't create symlink to domain\n");
+ return 0;
+}
+
+static irqreturn_t vmd_irq(int irq, void *data)
+{
+ struct vmd_irq_list *irqs = data;
+ struct vmd_irq *vmdirq;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vmdirq, &irqs->irq_list, node)
+ generic_handle_irq(vmdirq->virq);
+ rcu_read_unlock();
+
+ return IRQ_HANDLED;
+}
+
+static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ struct vmd_dev *vmd;
+ int i, err;
+
+ if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20))
+ return -ENOMEM;
+
+ vmd = devm_kzalloc(&dev->dev, sizeof(*vmd), GFP_KERNEL);
+ if (!vmd)
+ return -ENOMEM;
+
+ vmd->dev = dev;
+ err = pcim_enable_device(dev);
+ if (err < 0)
+ return err;
+
+ vmd->cfgbar = pcim_iomap(dev, VMD_CFGBAR, 0);
+ if (!vmd->cfgbar)
+ return -ENOMEM;
+
+ pci_set_master(dev);
+ if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)))
+ return -ENODEV;
+
+ vmd->msix_count = pci_msix_vec_count(dev);
+ if (vmd->msix_count < 0)
+ return -ENODEV;
+
+ vmd->irqs = devm_kcalloc(&dev->dev, vmd->msix_count, sizeof(*vmd->irqs),
+ GFP_KERNEL);
+ if (!vmd->irqs)
+ return -ENOMEM;
+
+ vmd->msix_entries = devm_kcalloc(&dev->dev, vmd->msix_count,
+ sizeof(*vmd->msix_entries),
+ GFP_KERNEL);
+ if (!vmd->msix_entries)
+ return -ENOMEM;
+ for (i = 0; i < vmd->msix_count; i++)
+ vmd->msix_entries[i].entry = i;
+
+ vmd->msix_count = pci_enable_msix_range(vmd->dev, vmd->msix_entries, 1,
+ vmd->msix_count);
+ if (vmd->msix_count < 0)
+ return vmd->msix_count;
+
+ for (i = 0; i < vmd->msix_count; i++) {
+ INIT_LIST_HEAD(&vmd->irqs[i].irq_list);
+ vmd->irqs[i].vmd_vector = vmd->msix_entries[i].vector;
+ vmd->irqs[i].index = i;
+
+ err = devm_request_irq(&dev->dev, vmd->irqs[i].vmd_vector,
+ vmd_irq, 0, "vmd", &vmd->irqs[i]);
+ if (err)
+ return err;
+ }
+
+ spin_lock_init(&vmd->cfg_lock);
+ pci_set_drvdata(dev, vmd);
+ err = vmd_enable_domain(vmd);
+ if (err)
+ return err;
+
+ dev_info(&vmd->dev->dev, "Bound to PCI domain %04x\n",
+ vmd->sysdata.domain);
+ return 0;
+}
+
+static void vmd_remove(struct pci_dev *dev)
+{
+ struct vmd_dev *vmd = pci_get_drvdata(dev);
+
+ pci_set_drvdata(dev, NULL);
+ sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
+ pci_stop_root_bus(vmd->bus);
+ pci_remove_root_bus(vmd->bus);
+ vmd_teardown_dma_ops(vmd);
+ irq_domain_remove(vmd->irq_domain);
+}
+
+#ifdef CONFIG_PM
+static int vmd_suspend(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ pci_save_state(pdev);
+ return 0;
+}
+
+static int vmd_resume(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+
+ pci_restore_state(pdev);
+ return 0;
+}
+#endif
+static SIMPLE_DEV_PM_OPS(vmd_dev_pm_ops, vmd_suspend, vmd_resume);
+
+static const struct pci_device_id vmd_ids[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x201d),},
+ {0,}
+};
+MODULE_DEVICE_TABLE(pci, vmd_ids);
+
+static struct pci_driver vmd_drv = {
+ .name = "vmd",
+ .id_table = vmd_ids,
+ .probe = vmd_probe,
+ .remove = vmd_remove,
+ .driver = {
+ .pm = &vmd_dev_pm_ops,
+ },
+};
+module_pci_driver(vmd_drv);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL v2");
+MODULE_VERSION("0.6");
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
index 5ca8ead91579..81c769e80614 100644
--- a/arch/x86/platform/atom/punit_atom_debug.c
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -25,8 +25,6 @@
#include <asm/cpu_device_id.h>
#include <asm/iosf_mbi.h>
-/* Side band Interface port */
-#define PUNIT_PORT 0x04
/* Power gate status reg */
#define PWRGT_STATUS 0x61
/* Subsystem config/status Video processor */
@@ -85,9 +83,8 @@ static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
while (punit_devp->name) {
- status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
- punit_devp->reg,
- &punit_pwr_status);
+ status = iosf_mbi_read(BT_MBI_UNIT_PMC, MBI_REG_READ,
+ punit_devp->reg, &punit_pwr_status);
if (status) {
seq_printf(seq_file, "%9s : Read Failed\n",
punit_devp->name);
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 0ee619f9fcb7..c1bdafaac3ca 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -111,23 +111,19 @@ static int imr_read(struct imr_device *idev, u32 imr_id, struct imr_regs *imr)
u32 reg = imr_id * IMR_NUM_REGS + idev->reg_base;
int ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_lo);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_lo);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->addr_hi);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->addr_hi);
if (ret)
return ret;
- ret = iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->rmask);
+ ret = iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->rmask);
if (ret)
return ret;
- return iosf_mbi_read(QRK_MBI_UNIT_MM, QRK_MBI_MM_READ,
- reg++, &imr->wmask);
+ return iosf_mbi_read(QRK_MBI_UNIT_MM, MBI_REG_READ, reg++, &imr->wmask);
}
/**
@@ -151,31 +147,27 @@ static int imr_write(struct imr_device *idev, u32 imr_id,
local_irq_save(flags);
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE, reg++,
- imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_lo);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->addr_hi);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->addr_hi);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->rmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->rmask);
if (ret)
goto failed;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg++, imr->wmask);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE, reg++, imr->wmask);
if (ret)
goto failed;
/* Lock bit must be set separately to addr_lo address bits. */
if (lock) {
imr->addr_lo |= IMR_LOCK;
- ret = iosf_mbi_write(QRK_MBI_UNIT_MM, QRK_MBI_MM_WRITE,
- reg - IMR_NUM_REGS, imr->addr_lo);
+ ret = iosf_mbi_write(QRK_MBI_UNIT_MM, MBI_REG_WRITE,
+ reg - IMR_NUM_REGS, imr->addr_lo);
if (ret)
goto failed;
}
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 2730d775ef9a..3e75fcf6b836 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -70,3 +70,4 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
-I$(srctree)/arch/x86/boot
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
+UBSAN_SANITIZE := n
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index a8fecc226946..3ee2bb6b440b 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -17,7 +17,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \
ifeq ($(CONFIG_X86_32),y)
obj-y += checksum_32.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+obj-$(CONFIG_ELF_CORE) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 755481f14d90..174781a404ff 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,13 +36,6 @@
#endif /* CONFIG_X86_PPRO_FENCE */
#define dma_wmb() barrier()
-#define smp_mb() barrier()
-#define smp_rmb() barrier()
-#define smp_wmb() barrier()
-
-#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
-
-#define read_barrier_depends() do { } while (0)
-#define smp_read_barrier_depends() do { } while (0)
+#include <asm-generic/barrier.h>
#endif
diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h
index 81d6562ce01d..11ab90dc5f14 100644
--- a/arch/x86/um/asm/syscall.h
+++ b/arch/x86/um/asm/syscall.h
@@ -1,6 +1,7 @@
#ifndef __UM_ASM_SYSCALL_H
#define __UM_ASM_SYSCALL_H
+#include <asm/syscall-generic.h>
#include <uapi/linux/audit.h>
typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index a29756f2d940..47c78d5e5c32 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -68,6 +68,7 @@ static const int reg_offsets[] = {
[EFL] = HOST_EFLAGS,
[UESP] = HOST_SP,
[SS] = HOST_SS,
+ [ORIG_EAX] = HOST_ORIG_AX,
};
int putreg(struct task_struct *child, int regno, unsigned long value)
@@ -83,6 +84,7 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
case EAX:
case EIP:
case UESP:
+ case ORIG_EAX:
break;
case FS:
if (value && (value & 3) != 3)
@@ -108,9 +110,6 @@ int putreg(struct task_struct *child, int regno, unsigned long value)
value &= FLAG_MASK;
child->thread.regs.regs.gp[HOST_EFLAGS] |= value;
return 0;
- case ORIG_EAX:
- child->thread.regs.regs.syscall = value;
- return 0;
default :
panic("Bad register in putreg() : %d\n", regno);
}
@@ -143,8 +142,6 @@ unsigned long getreg(struct task_struct *child, int regno)
regno >>= 2;
switch (regno) {
- case ORIG_EAX:
- return child->thread.regs.regs.syscall;
case FS:
case GS:
case DS:
@@ -163,6 +160,7 @@ unsigned long getreg(struct task_struct *child, int regno)
case EDI:
case EBP:
case EFL:
+ case ORIG_EAX:
break;
default:
panic("Bad register in getreg() : %d\n", regno);
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index acda713ab5be..abf4901c917b 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -64,7 +64,7 @@ static u32 xen_apic_read(u32 reg)
if (reg != APIC_ID)
return 0;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
return 0;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 23063923e364..d09e4c9d7cc5 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -415,7 +415,7 @@ static bool __init xen_check_mwait(void)
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
- if ((HYPERVISOR_dom0_op(&op) == 0) &&
+ if ((HYPERVISOR_platform_op(&op) == 0) &&
(buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
@@ -1365,7 +1365,7 @@ static void __init xen_boot_params_init_edd(void)
info->params.length = sizeof(info->params);
set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
&info->params);
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
@@ -1383,7 +1383,7 @@ static void __init xen_boot_params_init_edd(void)
op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
op.u.firmware_info.index = nr;
- ret = HYPERVISOR_dom0_op(&op);
+ ret = HYPERVISOR_platform_op(&op);
if (ret)
break;
mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
@@ -1690,7 +1690,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
- if (HYPERVISOR_dom0_op(&op) == 0)
+ if (HYPERVISOR_platform_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
/* Make sure ACS will be enabled */
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index df0c40559583..7f664c416faf 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -34,7 +34,8 @@ static void xen_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
- xen_hvm_init_shared_info();
+ if (!suspend_cancelled)
+ xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index f1ba6a092854..a0a4e554c6f1 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
+#include <linux/timekeeper_internal.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
@@ -32,86 +33,12 @@
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
-/* runstate info updated by Xen */
-static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
-
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
-/* return an consistent snapshot of 64-bit time/counter value */
-static u64 get64(const u64 *p)
-{
- u64 ret;
-
- if (BITS_PER_LONG < 64) {
- u32 *p32 = (u32 *)p;
- u32 h, l;
-
- /*
- * Read high then low, and then make sure high is
- * still the same; this will only loop if low wraps
- * and carries into high.
- * XXX some clean way to make this endian-proof?
- */
- do {
- h = p32[1];
- barrier();
- l = p32[0];
- barrier();
- } while (p32[1] != h);
-
- ret = (((u64)h) << 32) | l;
- } else
- ret = *p;
-
- return ret;
-}
-
-/*
- * Runstate accounting
- */
-static void get_runstate_snapshot(struct vcpu_runstate_info *res)
-{
- u64 state_time;
- struct vcpu_runstate_info *state;
-
- BUG_ON(preemptible());
-
- state = this_cpu_ptr(&xen_runstate);
-
- /*
- * The runstate info is always updated by the hypervisor on
- * the current CPU, so there's no need to use anything
- * stronger than a compiler barrier when fetching it.
- */
- do {
- state_time = get64(&state->state_entry_time);
- barrier();
- *res = *state;
- barrier();
- } while (get64(&state->state_entry_time) != state_time);
-}
-
-/* return true when a vcpu could run but has no real cpu to run on */
-bool xen_vcpu_stolen(int vcpu)
-{
- return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
-}
-
-void xen_setup_runstate_info(int cpu)
-{
- struct vcpu_register_runstate_memory_area area;
-
- area.addr.v = &per_cpu(xen_runstate, cpu);
-
- if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
- cpu, &area))
- BUG();
-}
-
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
@@ -119,7 +46,7 @@ static void do_stolen_accounting(void)
s64 runnable, offline, stolen;
cputime_t ticks;
- get_runstate_snapshot(&state);
+ xen_get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
@@ -194,26 +121,46 @@ static int xen_pvclock_gtod_notify(struct notifier_block *nb,
unsigned long was_set, void *priv)
{
/* Protected by the calling core code serialization */
- static struct timespec next_sync;
+ static struct timespec64 next_sync;
struct xen_platform_op op;
- struct timespec now;
+ struct timespec64 now;
+ struct timekeeper *tk = priv;
+ static bool settime64_supported = true;
+ int ret;
- now = __current_kernel_time();
+ now.tv_sec = tk->xtime_sec;
+ now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
/*
* We only take the expensive HV call when the clock was set
* or when the 11 minutes RTC synchronization time elapsed.
*/
- if (!was_set && timespec_compare(&now, &next_sync) < 0)
+ if (!was_set && timespec64_compare(&now, &next_sync) < 0)
return NOTIFY_OK;
- op.cmd = XENPF_settime;
- op.u.settime.secs = now.tv_sec;
- op.u.settime.nsecs = now.tv_nsec;
- op.u.settime.system_time = xen_clocksource_read();
+again:
+ if (settime64_supported) {
+ op.cmd = XENPF_settime64;
+ op.u.settime64.mbz = 0;
+ op.u.settime64.secs = now.tv_sec;
+ op.u.settime64.nsecs = now.tv_nsec;
+ op.u.settime64.system_time = xen_clocksource_read();
+ } else {
+ op.cmd = XENPF_settime32;
+ op.u.settime32.secs = now.tv_sec;
+ op.u.settime32.nsecs = now.tv_nsec;
+ op.u.settime32.system_time = xen_clocksource_read();
+ }
+
+ ret = HYPERVISOR_platform_op(&op);
- (void)HYPERVISOR_dom0_op(&op);
+ if (ret == -ENOSYS && settime64_supported) {
+ settime64_supported = false;
+ goto again;
+ }
+ if (ret < 0)
+ return NOTIFY_BAD;
/*
* Move the next drift compensation time 11 minutes