aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2021-09-11 00:38:47 +0200
committerThomas Gleixner <tglx@linutronix.de>2021-09-11 00:38:47 +0200
commitc2f4954c2d3fc4f77b46c67585e17a58df4ba8e4 (patch)
tree533a2077028e02a851e51ad509a0aa3a9107999f /arch/x86
parentdrivers: base: cacheinfo: Get rid of DEFINE_SMP_CALL_CACHE_FUNCTION() (diff)
parentMerge tag 'acpi-5.15-rc1-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm (diff)
downloadlinux-dev-c2f4954c2d3fc4f77b46c67585e17a58df4ba8e4.tar.xz
linux-dev-c2f4954c2d3fc4f77b46c67585e17a58df4ba8e4.zip
Merge branch 'linus' into smp/urgent
Ensure that all usage sites of get/put_online_cpus() except for the struggler in drivers/thermal are gone. So the last user and the deprecated inlines can be removed.
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig29
-rw-r--r--arch/x86/Kconfig.debug3
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/Makefile_32.cpu6
-rw-r--r--arch/x86/boot/boot.h2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl7
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl3
-rw-r--r--arch/x86/entry/vdso/Makefile2
-rw-r--r--arch/x86/hyperv/hv_init.c165
-rw-r--r--arch/x86/ia32/ia32_aout.c8
-rw-r--r--arch/x86/include/asm/compat.h27
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h9
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h96
-rw-r--r--arch/x86/include/asm/mshyperv.h4
-rw-r--r--arch/x86/include/asm/signal.h1
-rw-r--r--arch/x86/include/asm/sysfb.h94
-rw-r--r--arch/x86/include/asm/uaccess_64.h7
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/amd_gart_64.c18
-rw-r--r--arch/x86/kernel/amd_nb.c5
-rw-r--r--arch/x86/kernel/aperture_64.c5
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c38
-rw-r--r--arch/x86/kernel/head_32.S2
-rw-r--r--arch/x86/kernel/kvm.c5
-rw-r--r--arch/x86/kernel/ldt.c6
-rw-r--r--arch/x86/kernel/setup.c11
-rw-r--r--arch/x86/kernel/signal_compat.c6
-rw-r--r--arch/x86/kernel/sysfb.c70
-rw-r--r--arch/x86/kernel/sysfb_efi.c284
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c111
-rw-r--r--arch/x86/kvm/debugfs.c111
-rw-r--r--arch/x86/kvm/hyperv.c32
-rw-r--r--arch/x86/kvm/i8254.c3
-rw-r--r--arch/x86/kvm/ioapic.h4
-rw-r--r--arch/x86/kvm/lapic.c26
-rw-r--r--arch/x86/kvm/mmu.h25
-rw-r--r--arch/x86/kvm/mmu/mmu.c524
-rw-r--r--arch/x86/kvm/mmu/mmu_audit.c4
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h18
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h6
-rw-r--r--arch/x86/kvm/mmu/page_track.c1
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h6
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c139
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h29
-rw-r--r--arch/x86/kvm/pmu.c5
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c49
-rw-r--r--arch/x86/kvm/svm/nested.c5
-rw-r--r--arch/x86/kvm/svm/sev.c3
-rw-r--r--arch/x86/kvm/svm/svm.c97
-rw-r--r--arch/x86/kvm/svm/svm.h8
-rw-r--r--arch/x86/kvm/svm/svm_ops.h2
-rw-r--r--arch/x86/kvm/vmx/evmcs.c1
-rw-r--r--arch/x86/kvm/vmx/evmcs.h4
-rw-r--r--arch/x86/kvm/vmx/nested.c56
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c4
-rw-r--r--arch/x86/kvm/vmx/vmcs.h2
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c333
-rw-r--r--arch/x86/kvm/vmx/vmx.h38
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h4
-rw-r--r--arch/x86/kvm/x86.c189
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/kvm/xen.c23
-rw-r--r--arch/x86/kvm/xen.h5
-rw-r--r--arch/x86/mm/init.c23
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c3
-rw-r--r--arch/x86/mm/numa.c5
-rw-r--r--arch/x86/mm/numa_emulation.c5
-rw-r--r--arch/x86/net/bpf_jit_comp.c19
-rw-r--r--arch/x86/pci/numachip.c1
-rw-r--r--arch/x86/pci/sta2x11-fixup.c3
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/um/shared/sysdep/stub_32.h12
-rw-r--r--arch/x86/um/shared/sysdep/stub_64.h12
-rw-r--r--arch/x86/um/stub_segv.c3
-rw-r--r--arch/x86/xen/enlighten_pv.c12
-rw-r--r--arch/x86/xen/mmu_pv.c4
-rw-r--r--arch/x86/xen/p2m.c4
-rw-r--r--arch/x86/xen/platform-pci-unplug.c16
-rw-r--r--arch/x86/xen/smp_pv.c1
87 files changed, 1309 insertions, 1617 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 421fa9e38c60..4e001bbbb425 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -145,8 +145,6 @@ config X86
select GENERIC_PENDING_IRQ if SMP
select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
- select GENERIC_STRNCPY_FROM_USER
- select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
@@ -261,6 +259,7 @@ config X86
select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+ select TRACE_IRQFLAGS_SUPPORT
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select HAVE_ARCH_KCSAN if X86_64
@@ -2767,32 +2766,6 @@ config AMD_NB
def_bool y
depends on CPU_SUP_AMD && PCI
-config X86_SYSFB
- bool "Mark VGA/VBE/EFI FB as generic system framebuffer"
- help
- Firmwares often provide initial graphics framebuffers so the BIOS,
- bootloader or kernel can show basic video-output during boot for
- user-guidance and debugging. Historically, x86 used the VESA BIOS
- Extensions and EFI-framebuffers for this, which are mostly limited
- to x86.
- This option, if enabled, marks VGA/VBE/EFI framebuffers as generic
- framebuffers so the new generic system-framebuffer drivers can be
- used on x86. If the framebuffer is not compatible with the generic
- modes, it is advertised as fallback platform framebuffer so legacy
- drivers like efifb, vesafb and uvesafb can pick it up.
- If this option is not selected, all system framebuffers are always
- marked as fallback platform framebuffers as usual.
-
- Note: Legacy fbdev drivers, including vesafb, efifb, uvesafb, will
- not be able to pick up generic system framebuffers if this option
- is selected. You are highly encouraged to enable simplefb as
- replacement if you select this option. simplefb can correctly deal
- with generic system framebuffers. But you should still keep vesafb
- and others enabled as fallback if a system framebuffer is
- incompatible with simplefb.
-
- If unsure, say Y.
-
endmenu
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 80b57e7f4947..d3a6f74a94bd 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_SUPPORT
- def_bool y
-
config TRACE_IRQFLAGS_NMI_SUPPORT
def_bool y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d82d01490dd3..7488cfbbd2f6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -73,7 +73,7 @@ ifeq ($(CONFIG_X86_32),y)
KBUILD_CFLAGS += $(cc_stack_align4)
# CPU-specific tuning. Anything which can be shared with UML should go here.
- include arch/x86/Makefile_32.cpu
+ include $(srctree)/arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
# temporary until string.h is fixed
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index cd3056759880..e7355f8b51c2 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -2,13 +2,7 @@
# CPU tuning section - shared with UML.
# Must change only cflags-y (or [yn]), not CFLAGS! That makes a difference for UML.
-#-mtune exists since gcc 3.4
-HAS_MTUNE := $(call cc-option-yn, -mtune=i386)
-ifeq ($(HAS_MTUNE),y)
tune = $(call cc-option,-mtune=$(1),$(2))
-else
-tune = $(call cc-option,-mcpu=$(1),$(2))
-endif
cflags-$(CONFIG_M486SX) += -march=i486
cflags-$(CONFIG_M486) += -march=i486
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index ca866f1cca2e..34c9dbb6a47d 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -18,7 +18,7 @@
#ifndef __ASSEMBLY__
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include <linux/types.h>
#include <linux/edd.h>
#include <asm/setup.h>
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9c9c4a888b1d..e81885384f60 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -156,7 +156,6 @@ CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
# CONFIG_8139TOO_PIO is not set
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index b60bd2d86034..e8a7a0af2bda 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -148,7 +148,6 @@ CONFIG_SKY2=y
CONFIG_FORCEDETH=y
CONFIG_8139TOO=y
CONFIG_R8169=y
-CONFIG_INPUT_POLLDEV=y
CONFIG_INPUT_EVDEV=y
CONFIG_INPUT_JOYSTICK=y
CONFIG_INPUT_TABLET=y
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ce763a12311c..960a021d543e 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -145,7 +145,7 @@
131 i386 quotactl sys_quotactl
132 i386 getpgid sys_getpgid
133 i386 fchdir sys_fchdir
-134 i386 bdflush sys_bdflush
+134 i386 bdflush sys_ni_syscall
135 i386 sysfs sys_sysfs
136 i386 personality sys_personality
137 i386 afs_syscall
@@ -286,7 +286,7 @@
272 i386 fadvise64_64 sys_ia32_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind
-275 i386 get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy
+275 i386 get_mempolicy sys_get_mempolicy
276 i386 set_mempolicy sys_set_mempolicy
277 i386 mq_open sys_mq_open compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink
@@ -328,7 +328,7 @@
314 i386 sync_file_range sys_ia32_sync_file_range
315 i386 tee sys_tee
316 i386 vmsplice sys_vmsplice
-317 i386 move_pages sys_move_pages compat_sys_move_pages
+317 i386 move_pages sys_move_pages
318 i386 getcpu sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait
320 i386 utimensat sys_utimensat_time32
@@ -452,3 +452,4 @@
445 i386 landlock_add_rule sys_landlock_add_rule
446 i386 landlock_restrict_self sys_landlock_restrict_self
447 i386 memfd_secret sys_memfd_secret
+448 i386 process_mrelease sys_process_mrelease
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f6b57799c1ea..18b5500ea8bf 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -369,6 +369,7 @@
445 common landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self
447 common memfd_secret sys_memfd_secret
+448 common process_mrelease sys_process_mrelease
#
# Due to a historical design error, certain syscalls are numbered differently
@@ -397,7 +398,7 @@
530 x32 set_robust_list compat_sys_set_robust_list
531 x32 get_robust_list compat_sys_get_robust_list
532 x32 vmsplice sys_vmsplice
-533 x32 move_pages compat_sys_move_pages
+533 x32 move_pages sys_move_pages
534 x32 preadv compat_sys_preadv64
535 x32 pwritev compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 05c4abc2fdfd..a2dddcc189f6 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -131,7 +131,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE
targets += vdsox32.lds $(vobjx32s-y)
$(obj)/%.so: OBJCOPYFLAGS := -S --remove-section __ex_table
-$(obj)/%.so: $(obj)/%.so.dbg
+$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
$(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 6952e219cba3..708a2712a516 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -7,10 +7,10 @@
* Author : K. Y. Srinivasan <kys@microsoft.com>
*/
-#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/types.h>
#include <linux/bitfield.h>
+#include <linux/io.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
@@ -39,71 +39,50 @@ EXPORT_SYMBOL_GPL(hv_hypercall_pg);
/* Storage to save the hypercall page temporarily for hibernation */
static void *hv_hypercall_pg_saved;
-u32 *hv_vp_index;
-EXPORT_SYMBOL_GPL(hv_vp_index);
-
struct hv_vp_assist_page **hv_vp_assist_page;
EXPORT_SYMBOL_GPL(hv_vp_assist_page);
-void __percpu **hyperv_pcpu_input_arg;
-EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
-
-void __percpu **hyperv_pcpu_output_arg;
-EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
-
-u32 hv_max_vp_index;
-EXPORT_SYMBOL_GPL(hv_max_vp_index);
-
static int hv_cpu_init(unsigned int cpu)
{
- u64 msr_vp_index;
+ union hv_vp_assist_msr_contents msr = { 0 };
struct hv_vp_assist_page **hvp = &hv_vp_assist_page[smp_processor_id()];
- void **input_arg;
- struct page *pg;
-
- /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */
- pg = alloc_pages(irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL, hv_root_partition ? 1 : 0);
- if (unlikely(!pg))
- return -ENOMEM;
-
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- *input_arg = page_address(pg);
- if (hv_root_partition) {
- void **output_arg;
-
- output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
- *output_arg = page_address(pg + 1);
- }
-
- msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
-
- hv_vp_index[smp_processor_id()] = msr_vp_index;
+ int ret;
- if (msr_vp_index > hv_max_vp_index)
- hv_max_vp_index = msr_vp_index;
+ ret = hv_common_cpu_init(cpu);
+ if (ret)
+ return ret;
if (!hv_vp_assist_page)
return 0;
- /*
- * The VP ASSIST PAGE is an "overlay" page (see Hyper-V TLFS's Section
- * 5.2.1 "GPA Overlay Pages"). Here it must be zeroed out to make sure
- * we always write the EOI MSR in hv_apic_eoi_write() *after* the
- * EOI optimization is disabled in hv_cpu_die(), otherwise a CPU may
- * not be stopped in the case of CPU offlining and the VM will hang.
- */
if (!*hvp) {
- *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
- }
-
- if (*hvp) {
- u64 val;
-
- val = vmalloc_to_pfn(*hvp);
- val = (val << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) |
- HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
-
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, val);
+ if (hv_root_partition) {
+ /*
+ * For root partition we get the hypervisor provided VP assist
+ * page, instead of allocating a new page.
+ */
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ *hvp = memremap(msr.pfn <<
+ HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT,
+ PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ /*
+ * The VP assist page is an "overlay" page (see Hyper-V TLFS's
+ * Section 5.2.1 "GPA Overlay Pages"). Here it must be zeroed
+ * out to make sure we always write the EOI MSR in
+ * hv_apic_eoi_write() *after* the EOI optimization is disabled
+ * in hv_cpu_die(), otherwise a CPU may not be stopped in the
+ * case of CPU offlining and the VM will hang.
+ */
+ *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ if (*hvp)
+ msr.pfn = vmalloc_to_pfn(*hvp);
+ }
+ WARN_ON(!(*hvp));
+ if (*hvp) {
+ msr.enable = 1;
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ }
}
return 0;
@@ -198,29 +177,26 @@ static int hv_cpu_die(unsigned int cpu)
{
struct hv_reenlightenment_control re_ctrl;
unsigned int new_cpu;
- unsigned long flags;
- void **input_arg;
- void *pg;
- local_irq_save(flags);
- input_arg = (void **)this_cpu_ptr(hyperv_pcpu_input_arg);
- pg = *input_arg;
- *input_arg = NULL;
-
- if (hv_root_partition) {
- void **output_arg;
-
- output_arg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg);
- *output_arg = NULL;
+ hv_common_cpu_die(cpu);
+
+ if (hv_vp_assist_page && hv_vp_assist_page[cpu]) {
+ union hv_vp_assist_msr_contents msr = { 0 };
+ if (hv_root_partition) {
+ /*
+ * For root partition the VP assist page is mapped to
+ * hypervisor provided page, and thus we unmap the
+ * page here and nullify it, so that in future we have
+ * correct page address mapped in hv_cpu_init.
+ */
+ memunmap(hv_vp_assist_page[cpu]);
+ hv_vp_assist_page[cpu] = NULL;
+ rdmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
+ msr.enable = 0;
+ }
+ wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, msr.as_uint64);
}
- local_irq_restore(flags);
-
- free_pages((unsigned long)pg, hv_root_partition ? 1 : 0);
-
- if (hv_vp_assist_page && hv_vp_assist_page[cpu])
- wrmsrl(HV_X64_MSR_VP_ASSIST_PAGE, 0);
-
if (hv_reenlightenment_cb == NULL)
return 0;
@@ -368,7 +344,7 @@ void __init hyperv_init(void)
{
u64 guest_id, required_msrs;
union hv_x64_msr_hypercall_contents hypercall_msr;
- int cpuhp, i;
+ int cpuhp;
if (x86_hyper_type != X86_HYPER_MS_HYPERV)
return;
@@ -380,36 +356,14 @@ void __init hyperv_init(void)
if ((ms_hyperv.features & required_msrs) != required_msrs)
return;
- /*
- * Allocate the per-CPU state for the hypercall input arg.
- * If this allocation fails, we will not be able to setup
- * (per-CPU) hypercall input page and thus this failure is
- * fatal on Hyper-V.
- */
- hyperv_pcpu_input_arg = alloc_percpu(void *);
-
- BUG_ON(hyperv_pcpu_input_arg == NULL);
-
- /* Allocate the per-CPU state for output arg for root */
- if (hv_root_partition) {
- hyperv_pcpu_output_arg = alloc_percpu(void *);
- BUG_ON(hyperv_pcpu_output_arg == NULL);
- }
-
- /* Allocate percpu VP index */
- hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index),
- GFP_KERNEL);
- if (!hv_vp_index)
+ if (hv_common_init())
return;
- for (i = 0; i < num_possible_cpus(); i++)
- hv_vp_index[i] = VP_INVAL;
-
hv_vp_assist_page = kcalloc(num_possible_cpus(),
sizeof(*hv_vp_assist_page), GFP_KERNEL);
if (!hv_vp_assist_page) {
ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
- goto free_vp_index;
+ goto common_free;
}
cpuhp = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online",
@@ -507,9 +461,8 @@ remove_cpuhp_state:
free_vp_assist_page:
kfree(hv_vp_assist_page);
hv_vp_assist_page = NULL;
-free_vp_index:
- kfree(hv_vp_index);
- hv_vp_index = NULL;
+common_free:
+ hv_common_free();
}
/*
@@ -539,7 +492,6 @@ void hyperv_cleanup(void)
hypercall_msr.as_uint64 = 0;
wrmsrl(HV_X64_MSR_REFERENCE_TSC, hypercall_msr.as_uint64);
}
-EXPORT_SYMBOL_GPL(hyperv_cleanup);
void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
{
@@ -595,12 +547,6 @@ bool hv_is_hyperv_initialized(void)
}
EXPORT_SYMBOL_GPL(hv_is_hyperv_initialized);
-bool hv_is_hibernation_supported(void)
-{
- return !hv_root_partition && acpi_sleep_state_supported(ACPI_STATE_S4);
-}
-EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
-
enum hv_isolation_type hv_get_isolation_type(void)
{
if (!(ms_hyperv.priv_high & HV_ISOLATION))
@@ -613,4 +559,3 @@ bool hv_is_isolation_supported(void)
{
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
-EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 5e5b9fc2747f..9bd15241fadb 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -202,8 +202,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
PROT_READ | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_32BIT,
+ MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset);
if (error != N_TXTADDR(ex))
@@ -211,8 +210,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE |
- MAP_32BIT,
+ MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
fd_offset + ex.a_text);
if (error != N_DATADDR(ex))
return error;
@@ -293,7 +291,7 @@ static int load_aout_library(struct file *file)
/* Now use mmap to map the library into memory. */
error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_32BIT,
+ MAP_FIXED | MAP_PRIVATE | MAP_32BIT,
N_TXTOFF(ex));
retval = error;
if (error != start_addr)
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index be09c7eac89f..7516e4199b3c 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -12,6 +12,9 @@
#include <asm/user32.h>
#include <asm/unistd.h>
+#define compat_mode_t compat_mode_t
+typedef u16 compat_mode_t;
+
#include <asm-generic/compat.h>
#define COMPAT_USER_HZ 100
@@ -19,13 +22,9 @@
typedef u16 __compat_uid_t;
typedef u16 __compat_gid_t;
-typedef u32 __compat_uid32_t;
-typedef u32 __compat_gid32_t;
-typedef u16 compat_mode_t;
typedef u16 compat_dev_t;
typedef u16 compat_nlink_t;
typedef u16 compat_ipc_pid_t;
-typedef u32 compat_caddr_t;
typedef __kernel_fsid_t compat_fsid_t;
struct compat_stat {
@@ -92,13 +91,6 @@ struct compat_statfs {
#define COMPAT_RLIM_INFINITY 0xffffffff
-typedef u32 compat_old_sigset_t; /* at least 32 bits */
-
-#define _COMPAT_NSIG 64
-#define _COMPAT_NSIG_BPW 32
-
-typedef u32 compat_sigset_word;
-
#define COMPAT_OFF_T_MAX 0x7fffffff
struct compat_ipc64_perm {
@@ -164,19 +156,6 @@ struct compat_shmid64_ds {
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif
-static inline void __user *arch_compat_alloc_user_space(long len)
-{
- compat_uptr_t sp = task_pt_regs(current)->sp;
-
- /*
- * -128 for the x32 ABI redzone. For IA32, it is not strictly
- * necessary, but not harmful.
- */
- sp -= 128;
-
- return (void __user *)round_down(sp - len, 16);
-}
-
static inline bool in_x32_syscall(void)
{
#ifdef CONFIG_X86_X32_ABI
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index f1366ce609e3..2322d6bd5883 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -288,6 +288,15 @@ union hv_x64_msr_hypercall_contents {
} __packed;
};
+union hv_vp_assist_msr_contents {
+ u64 as_uint64;
+ struct {
+ u64 enable:1;
+ u64 reserved:11;
+ u64 pfn:52;
+ } __packed;
+};
+
struct hv_reenlightenment_control {
__u64 vector:8;
__u64 reserved1:8;
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index a12a4987154e..cefe1d81e2e8 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP(update_cr8_intercept)
KVM_X86_OP(check_apicv_inhibit_reasons)
-KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP(hwapic_irr_update)
KVM_X86_OP(hwapic_isr_update)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index af6ce8d4c86a..f8f48a7ec577 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -37,9 +37,21 @@
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
-#define KVM_MAX_VCPUS 288
-#define KVM_SOFT_MAX_VCPUS 240
-#define KVM_MAX_VCPU_ID 1023
+#define KVM_MAX_VCPUS 1024
+#define KVM_SOFT_MAX_VCPUS 710
+
+/*
+ * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
+ * might be larger than the actual number of VCPUs because the
+ * APIC ID encodes CPU topology information.
+ *
+ * In the worst case, we'll need less than one extra bit for the
+ * Core ID, and less than one extra bit for the Package (Die) ID,
+ * so ratio of 4 should be enough.
+ */
+#define KVM_VCPU_ID_RATIO 4
+#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)
+
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
@@ -124,13 +136,6 @@
#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
-static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
-{
- /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
- return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
-}
-
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
@@ -229,7 +234,8 @@ enum x86_intercept_stage;
KVM_GUESTDBG_USE_HW_BP | \
KVM_GUESTDBG_USE_SW_BP | \
KVM_GUESTDBG_INJECT_BP | \
- KVM_GUESTDBG_INJECT_DB)
+ KVM_GUESTDBG_INJECT_DB | \
+ KVM_GUESTDBG_BLOCKIRQ)
#define PFERR_PRESENT_BIT 0
@@ -447,6 +453,7 @@ struct kvm_mmu {
u64 *pae_root;
u64 *pml4_root;
+ u64 *pml5_root;
/*
* check zero bits on shadow page table entries, these
@@ -482,6 +489,7 @@ struct kvm_pmc {
* ctrl value for fixed counters.
*/
u64 current_config;
+ bool is_paused;
};
struct kvm_pmu {
@@ -522,7 +530,6 @@ struct kvm_pmu_ops;
enum {
KVM_DEBUGREG_BP_ENABLED = 1,
KVM_DEBUGREG_WONT_EXIT = 2,
- KVM_DEBUGREG_RELOAD = 4,
};
struct kvm_mtrr_range {
@@ -723,7 +730,6 @@ struct kvm_vcpu_arch {
u64 reserved_gpa_bits;
int maxphyaddr;
- int max_tdp_level;
/* emulate context */
@@ -988,6 +994,12 @@ struct kvm_hv {
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
+ /*
+ * How many SynICs use 'AutoEOI' feature
+ * (protected by arch.apicv_update_lock)
+ */
+ unsigned int synic_auto_eoi_used;
+
struct hv_partition_assist_pg *hv_pa_pg;
struct kvm_hv_syndbg hv_syndbg;
};
@@ -1002,9 +1014,8 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
bool long_mode;
- bool shinfo_set;
u8 upcall_vector;
- struct gfn_to_hva_cache shinfo_cache;
+ gfn_t shinfo_gfn;
};
enum kvm_irqchip_mode {
@@ -1061,6 +1072,9 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
+ /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
+ struct mutex apicv_update_lock;
+
bool apic_access_memslot_enabled;
unsigned long apicv_inhibit_reasons;
@@ -1213,9 +1227,17 @@ struct kvm_vm_stat {
u64 mmu_recycled;
u64 mmu_cache_miss;
u64 mmu_unsync;
- u64 lpages;
+ union {
+ struct {
+ atomic64_t pages_4k;
+ atomic64_t pages_2m;
+ atomic64_t pages_1g;
+ };
+ atomic64_t pages[KVM_NR_PAGE_SIZES];
+ };
u64 nx_lpage_splits;
u64 max_mmu_page_hash_collisions;
+ u64 max_mmu_rmap_size;
};
struct kvm_vcpu_stat {
@@ -1359,7 +1381,6 @@ struct kvm_x86_ops {
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(ulong bit);
- void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ const struct kvm_memory_slot *memslot);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
@@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void kvm_request_apicv_update(struct kvm *kvm, bool activate,
unsigned long bit);
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate,
+ unsigned long bit);
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
@@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level);
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level);
static inline u16 kvm_read_ldt(void)
{
@@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr)
}
#endif
-static inline u32 get_rdx_init_val(void)
-{
- return 0x600; /* P6 family */
-}
-
static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -1816,31 +1835,6 @@ enum {
#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
-asmlinkage void kvm_spurious_fault(void);
-
-/*
- * Hardware virtualization extension instructions may fault if a
- * reboot turns off virtualization while processes are running.
- * Usually after catching the fault we just panic; during reboot
- * instead the instruction is ignored.
- */
-#define __kvm_handle_fault_on_reboot(insn) \
- "666: \n\t" \
- insn "\n\t" \
- "jmp 668f \n\t" \
- "667: \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_begin \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "call kvm_spurious_fault \n\t" \
- "1: \n\t" \
- ".pushsection .discard.instr_end \n\t" \
- ".long 1b - . \n\t" \
- ".popsection \n\t" \
- "668: \n\t" \
- _ASM_EXTABLE(666b, 667b)
-
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 67ff0d637e55..adccbc209169 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -36,8 +36,6 @@ void hyperv_vector_handler(struct pt_regs *regs);
extern int hyperv_init_cpuhp;
extern void *hv_hypercall_pg;
-extern void __percpu **hyperv_pcpu_input_arg;
-extern void __percpu **hyperv_pcpu_output_arg;
extern u64 hv_current_partition_id;
@@ -170,8 +168,6 @@ int hyperv_fill_flush_guest_mapping_list(
struct hv_guest_mapping_flush_list *flush,
u64 start_gfn, u64 end_gfn);
-extern bool hv_root_partition;
-
#ifdef CONFIG_X86_64
void hv_apic_init(void);
void __init hv_init_spinlocks(void);
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 6fd8410a3910..2dfb5fea13af 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -29,6 +29,7 @@ typedef struct {
#define SA_X32_ABI 0x01000000u
#ifndef CONFIG_COMPAT
+#define compat_sigset_t compat_sigset_t
typedef sigset_t compat_sigset_t;
#endif
diff --git a/arch/x86/include/asm/sysfb.h b/arch/x86/include/asm/sysfb.h
deleted file mode 100644
index 9834eef7f034..000000000000
--- a/arch/x86/include/asm/sysfb.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ARCH_X86_KERNEL_SYSFB_H
-#define _ARCH_X86_KERNEL_SYSFB_H
-
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/screen_info.h>
-
-enum {
- M_I17, /* 17-Inch iMac */
- M_I20, /* 20-Inch iMac */
- M_I20_SR, /* 20-Inch iMac (Santa Rosa) */
- M_I24, /* 24-Inch iMac */
- M_I24_8_1, /* 24-Inch iMac, 8,1th gen */
- M_I24_10_1, /* 24-Inch iMac, 10,1th gen */
- M_I27_11_1, /* 27-Inch iMac, 11,1th gen */
- M_MINI, /* Mac Mini */
- M_MINI_3_1, /* Mac Mini, 3,1th gen */
- M_MINI_4_1, /* Mac Mini, 4,1th gen */
- M_MB, /* MacBook */
- M_MB_2, /* MacBook, 2nd rev. */
- M_MB_3, /* MacBook, 3rd rev. */
- M_MB_5_1, /* MacBook, 5th rev. */
- M_MB_6_1, /* MacBook, 6th rev. */
- M_MB_7_1, /* MacBook, 7th rev. */
- M_MB_SR, /* MacBook, 2nd gen, (Santa Rosa) */
- M_MBA, /* MacBook Air */
- M_MBA_3, /* Macbook Air, 3rd rev */
- M_MBP, /* MacBook Pro */
- M_MBP_2, /* MacBook Pro 2nd gen */
- M_MBP_2_2, /* MacBook Pro 2,2nd gen */
- M_MBP_SR, /* MacBook Pro (Santa Rosa) */
- M_MBP_4, /* MacBook Pro, 4th gen */
- M_MBP_5_1, /* MacBook Pro, 5,1th gen */
- M_MBP_5_2, /* MacBook Pro, 5,2th gen */
- M_MBP_5_3, /* MacBook Pro, 5,3rd gen */
- M_MBP_6_1, /* MacBook Pro, 6,1th gen */
- M_MBP_6_2, /* MacBook Pro, 6,2th gen */
- M_MBP_7_1, /* MacBook Pro, 7,1th gen */
- M_MBP_8_2, /* MacBook Pro, 8,2nd gen */
- M_UNKNOWN /* placeholder */
-};
-
-struct efifb_dmi_info {
- char *optname;
- unsigned long base;
- int stride;
- int width;
- int height;
- int flags;
-};
-
-#ifdef CONFIG_EFI
-
-extern struct efifb_dmi_info efifb_dmi_list[];
-void sysfb_apply_efi_quirks(void);
-
-#else /* CONFIG_EFI */
-
-static inline void sysfb_apply_efi_quirks(void)
-{
-}
-
-#endif /* CONFIG_EFI */
-
-#ifdef CONFIG_X86_SYSFB
-
-bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode);
-int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode);
-
-#else /* CONFIG_X86_SYSFB */
-
-static inline bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode)
-{
- return false;
-}
-
-static inline int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode)
-{
- return -EINVAL;
-}
-
-#endif /* CONFIG_X86_SYSFB */
-
-#endif /* _ARCH_X86_KERNEL_SYSFB_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index e7265a552f4f..45697e04d771 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -58,13 +58,6 @@ raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
return copy_user_generic((__force void *)dst, src, size);
}
-static __always_inline __must_check
-unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
-{
- return copy_user_generic((__force void *)dst,
- (__force void *)src, size);
-}
-
extern long __copy_user_nocache(void *dst, const void __user *src,
unsigned size, int zerorest);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index a6c327f8ad9e..2ef1f6513c68 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -295,6 +295,7 @@ struct kvm_debug_exit_arch {
#define KVM_GUESTDBG_USE_HW_BP 0x00020000
#define KVM_GUESTDBG_INJECT_DB 0x00040000
#define KVM_GUESTDBG_INJECT_BP 0x00080000
+#define KVM_GUESTDBG_BLOCKIRQ 0x00100000
/* for KVM_SET_GUEST_DEBUG */
struct kvm_guest_debug_arch {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3e625c61f008..8f4e8fa6ed75 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -136,9 +136,6 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
obj-$(CONFIG_UPROBES) += uprobes.o
-obj-y += sysfb.o
-obj-$(CONFIG_X86_SYSFB) += sysfb_simplefb.o
-obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 9ac696487b13..ed837383de5c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -331,7 +331,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
int i;
if (iommu_start == -1)
- return -1;
+ return -ENOMEM;
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -380,13 +380,13 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs)
{
struct scatterlist *s, *ps, *start_sg, *sgmap;
- int need = 0, nextneed, i, out, start;
+ int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
if (nents == 0)
- return 0;
+ return -EINVAL;
out = 0;
start = 0;
@@ -414,8 +414,9 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
- if (dma_map_cont(dev, start_sg, i - start,
- sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start,
+ sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
@@ -432,7 +433,8 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
- if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+ ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+ if (ret < 0)
goto error;
out++;
flush_gart();
@@ -456,9 +458,7 @@ error:
panic("dma_map_sg: overflow on %lu pages\n", pages);
iommu_full(dev, pages << PAGE_SHIFT, dir);
- for_each_sg(sg, s, nents, i)
- s->dma_address = DMA_MAPPING_ERROR;
- return 0;
+ return ret;
}
/* allocate and map a coherent mapping */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 23dda362dc0f..c92c9c774c0e 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -25,6 +25,8 @@
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
+#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
+#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
@@ -37,6 +39,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
{}
};
@@ -58,6 +61,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
{}
};
@@ -74,6 +78,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 294ed4392a0e..10562885f5fc 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -109,14 +109,13 @@ static u32 __init allocate_aperture(void)
* memory. Unfortunately we cannot move it up because that would
* make the IOMMU useless.
*/
- addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
- aper_size, aper_size);
+ addr = memblock_phys_alloc_range(aper_size, aper_size,
+ GART_MIN_ADDR, GART_MAX_ADDR);
if (!addr) {
pr_err("Cannot allocate aperture memory hole [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
return 0;
}
- memblock_reserve(addr, aper_size);
pr_info("Mapping aperture over RAM [mem %#010lx-%#010lx] (%uKB)\n",
addr, addr + aper_size - 1, aper_size >> 10);
register_nosave_region(addr >> PAGE_SHIFT,
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c890d67a64ad..e095c28d27ae 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -17,7 +17,6 @@
#include <linux/irq.h>
#include <linux/kexec.h>
#include <linux/i8253.h>
-#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <asm/processor.h>
#include <asm/hypervisor.h>
@@ -36,10 +35,7 @@
/* Is Linux running as the root partition? */
bool hv_root_partition;
-EXPORT_SYMBOL_GPL(hv_root_partition);
-
struct ms_hyperv_info ms_hyperv;
-EXPORT_SYMBOL_GPL(ms_hyperv);
#if IS_ENABLED(CONFIG_HYPERV)
static void (*vmbus_handler)(void);
@@ -65,14 +61,12 @@ void hv_setup_vmbus_handler(void (*handler)(void))
{
vmbus_handler = handler;
}
-EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);
void hv_remove_vmbus_handler(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
/*
* Routines to do per-architecture handling of stimer0
@@ -107,25 +101,21 @@ void hv_setup_kexec_handler(void (*handler)(void))
{
hv_kexec_handler = handler;
}
-EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
void hv_remove_kexec_handler(void)
{
hv_kexec_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
{
hv_crash_handler = handler;
}
-EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
void hv_remove_crash_handler(void)
{
hv_crash_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
#ifdef CONFIG_KEXEC_CORE
static void hv_machine_shutdown(void)
@@ -335,16 +325,6 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.nested_features);
}
- /*
- * Hyper-V expects to get crash register data or kmsg when
- * crash enlightment is available and system crashes. Set
- * crash_kexec_post_notifiers to be true to make sure that
- * calling crash enlightment interface before running kdump
- * kernel.
- */
- if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
- crash_kexec_post_notifiers = true;
-
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
@@ -373,10 +353,17 @@ static void __init ms_hyperv_init_platform(void)
machine_ops.crash_shutdown = hv_machine_crash_shutdown;
#endif
if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
+ /*
+ * Writing to synthetic MSR 0x40000118 updates/changes the
+ * guest visible CPUIDs. Setting bit 0 of this MSR enables
+ * guests to report invariant TSC feature through CPUID
+ * instruction, CPUID 0x800000007/EDX, bit 8. See code in
+ * early_init_intel() where this bit is examined. The
+ * setting of this MSR bit should happen before init_intel()
+ * is called.
+ */
wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
- } else {
- mark_tsc_unstable("running on Hyper-V");
}
/*
@@ -437,6 +424,13 @@ static void __init ms_hyperv_init_platform(void)
/* Register Hyper-V specific clocksource */
hv_init_clocksource();
#endif
+ /*
+ * TSC should be marked as unstable only after Hyper-V
+ * clocksource has been initialized. This ensures that the
+ * stability of the sched_clock is not altered.
+ */
+ if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+ mark_tsc_unstable("running on Hyper-V");
}
static bool __init ms_hyperv_x2apic_available(void)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 67f590425d90..d8c64dab0efe 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -432,7 +432,7 @@ SYM_FUNC_START(early_ignore_irq)
pushl 32(%esp)
pushl 40(%esp)
pushl $int_msg
- call printk
+ call _printk
call dump_stack
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a26643dc6bd6..b656456c3a94 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val)
} else {
local_irq_disable();
+ /* safe_halt() will enable IRQ */
if (READ_ONCE(*ptr) == val)
safe_halt();
-
- local_irq_enable();
+ else
+ local_irq_enable();
}
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index aa15132228da..525876e7b9f4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -154,7 +154,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
if (num_entries > LDT_ENTRIES)
return NULL;
- new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
+ new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL_ACCOUNT);
if (!new_ldt)
return NULL;
@@ -168,9 +168,9 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
* than PAGE_SIZE.
*/
if (alloc_size > PAGE_SIZE)
- new_ldt->entries = vzalloc(alloc_size);
+ new_ldt->entries = __vmalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
else
- new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL);
+ new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!new_ldt->entries) {
kfree(new_ldt);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bff3a784aec5..79f164141116 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -5,6 +5,7 @@
* This file contains the setup_arch() code, which handles the architecture-dependent
* parts of early kernel initialization.
*/
+#include <linux/acpi.h>
#include <linux/console.h>
#include <linux/crash_dump.h>
#include <linux/dma-map-ops.h>
@@ -572,16 +573,6 @@ void __init reserve_standard_io_resources(void)
}
-static __init void reserve_ibft_region(void)
-{
- unsigned long addr, size = 0;
-
- addr = find_ibft_region(&size);
-
- if (size)
- memblock_reserve(addr, size);
-}
-
static bool __init snb_gfx_workaround_needed(void)
{
#ifdef CONFIG_PCI
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 06743ec054d2..b52407c56000 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -34,7 +34,13 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGSYS != 2);
/* This is part of the ABI and can never change in size: */
+ BUILD_BUG_ON(sizeof(siginfo_t) != 128);
BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
+
+ /* This is a part of the ABI and can never change in alignment */
+ BUILD_BUG_ON(__alignof__(siginfo_t) != 8);
+ BUILD_BUG_ON(__alignof__(compat_siginfo_t) != 4);
+
/*
* The offsets of all the (unioned) si_fields are fixed
* in the ABI, of course. Make sure none of them ever
diff --git a/arch/x86/kernel/sysfb.c b/arch/x86/kernel/sysfb.c
deleted file mode 100644
index 014ebd8ca869..000000000000
--- a/arch/x86/kernel/sysfb.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-/*
- * Simple-Framebuffer support for x86 systems
- * Create a platform-device for any available boot framebuffer. The
- * simple-framebuffer platform device is already available on DT systems, so
- * this module parses the global "screen_info" object and creates a suitable
- * platform device compatible with the "simple-framebuffer" DT object. If
- * the framebuffer is incompatible, we instead create a legacy
- * "vesa-framebuffer", "efi-framebuffer" or "platform-framebuffer" device and
- * pass the screen_info as platform_data. This allows legacy drivers
- * to pick these devices up without messing with simple-framebuffer drivers.
- * The global "screen_info" is still valid at all times.
- *
- * If CONFIG_X86_SYSFB is not selected, we never register "simple-framebuffer"
- * platform devices, but only use legacy framebuffer devices for
- * backwards compatibility.
- *
- * TODO: We set the dev_id field of all platform-devices to 0. This allows
- * other x86 OF/DT parsers to create such devices, too. However, they must
- * start at offset 1 for this to work.
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/platform_device.h>
-#include <linux/screen_info.h>
-#include <asm/sysfb.h>
-
-static __init int sysfb_init(void)
-{
- struct screen_info *si = &screen_info;
- struct simplefb_platform_data mode;
- struct platform_device *pd;
- const char *name;
- bool compatible;
- int ret;
-
- sysfb_apply_efi_quirks();
-
- /* try to create a simple-framebuffer device */
- compatible = parse_mode(si, &mode);
- if (compatible) {
- ret = create_simplefb(si, &mode);
- if (!ret)
- return 0;
- }
-
- /* if the FB is incompatible, create a legacy framebuffer device */
- if (si->orig_video_isVGA == VIDEO_TYPE_EFI)
- name = "efi-framebuffer";
- else if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- name = "vesa-framebuffer";
- else
- name = "platform-framebuffer";
-
- pd = platform_device_register_resndata(NULL, name, 0,
- NULL, 0, si, sizeof(*si));
- return PTR_ERR_OR_ZERO(pd);
-}
-
-/* must execute after PCI subsystem for EFI quirks */
-device_initcall(sysfb_init);
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
deleted file mode 100644
index 8a56a6d80098..000000000000
--- a/arch/x86/kernel/sysfb_efi.c
+++ /dev/null
@@ -1,284 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- *
- * EFI Quirks Copyright (c) 2006 Edgar Hucek <gimli@dark-green.com>
- */
-
-/*
- * EFI Quirks
- * Several EFI systems do not correctly advertise their boot framebuffers.
- * Hence, we use this static table of known broken machines and fix up the
- * information so framebuffer drivers can load correctly.
- */
-
-#include <linux/dmi.h>
-#include <linux/err.h>
-#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/pci.h>
-#include <linux/screen_info.h>
-#include <video/vga.h>
-
-#include <asm/efi.h>
-#include <asm/sysfb.h>
-
-enum {
- OVERRIDE_NONE = 0x0,
- OVERRIDE_BASE = 0x1,
- OVERRIDE_STRIDE = 0x2,
- OVERRIDE_HEIGHT = 0x4,
- OVERRIDE_WIDTH = 0x8,
-};
-
-struct efifb_dmi_info efifb_dmi_list[] = {
- [M_I17] = { "i17", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_I20] = { "i20", 0x80010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE }, /* guess */
- [M_I20_SR] = { "imac7", 0x40010000, 1728 * 4, 1680, 1050, OVERRIDE_NONE },
- [M_I24] = { "i24", 0x80010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE }, /* guess */
- [M_I24_8_1] = { "imac8", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_I24_10_1] = { "imac10", 0xc0010000, 2048 * 4, 1920, 1080, OVERRIDE_NONE },
- [M_I27_11_1] = { "imac11", 0xc0010000, 2560 * 4, 2560, 1440, OVERRIDE_NONE },
- [M_MINI]= { "mini", 0x80000000, 2048 * 4, 1024, 768, OVERRIDE_NONE },
- [M_MINI_3_1] = { "mini31", 0x40010000, 1024 * 4, 1024, 768, OVERRIDE_NONE },
- [M_MINI_4_1] = { "mini41", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MB] = { "macbook", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_5_1] = { "macbook51", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_6_1] = { "macbook61", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MB_7_1] = { "macbook71", 0x80010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MBA] = { "mba", 0x80000000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- /* 11" Macbook Air 3,1 passes the wrong stride */
- [M_MBA_3] = { "mba3", 0, 2048 * 4, 0, 0, OVERRIDE_STRIDE },
- [M_MBP] = { "mbp", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_2] = { "mbp2", 0, 0, 0, 0, OVERRIDE_NONE }, /* placeholder */
- [M_MBP_2_2] = { "mbp22", 0x80010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_SR] = { "mbp3", 0x80030000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_4] = { "mbp4", 0xc0060000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_5_1] = { "mbp51", 0xc0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_5_2] = { "mbp52", 0xc0010000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_5_3] = { "mbp53", 0xd0010000, 2048 * 4, 1440, 900, OVERRIDE_NONE },
- [M_MBP_6_1] = { "mbp61", 0x90030000, 2048 * 4, 1920, 1200, OVERRIDE_NONE },
- [M_MBP_6_2] = { "mbp62", 0x90030000, 2048 * 4, 1680, 1050, OVERRIDE_NONE },
- [M_MBP_7_1] = { "mbp71", 0xc0010000, 2048 * 4, 1280, 800, OVERRIDE_NONE },
- [M_MBP_8_2] = { "mbp82", 0x90010000, 1472 * 4, 1440, 900, OVERRIDE_NONE },
- [M_UNKNOWN] = { NULL, 0, 0, 0, 0, OVERRIDE_NONE }
-};
-
-void efifb_setup_from_dmi(struct screen_info *si, const char *opt)
-{
- int i;
-
- for (i = 0; i < M_UNKNOWN; i++) {
- if (efifb_dmi_list[i].base != 0 &&
- !strcmp(opt, efifb_dmi_list[i].optname)) {
- si->lfb_base = efifb_dmi_list[i].base;
- si->lfb_linelength = efifb_dmi_list[i].stride;
- si->lfb_width = efifb_dmi_list[i].width;
- si->lfb_height = efifb_dmi_list[i].height;
- }
- }
-}
-
-#define choose_value(dmivalue, fwvalue, field, flags) ({ \
- typeof(fwvalue) _ret_ = fwvalue; \
- if ((flags) & (field)) \
- _ret_ = dmivalue; \
- else if ((fwvalue) == 0) \
- _ret_ = dmivalue; \
- _ret_; \
- })
-
-static int __init efifb_set_system(const struct dmi_system_id *id)
-{
- struct efifb_dmi_info *info = id->driver_data;
-
- if (info->base == 0 && info->height == 0 && info->width == 0 &&
- info->stride == 0)
- return 0;
-
- /* Trust the bootloader over the DMI tables */
- if (screen_info.lfb_base == 0) {
-#if defined(CONFIG_PCI)
- struct pci_dev *dev = NULL;
- int found_bar = 0;
-#endif
- if (info->base) {
- screen_info.lfb_base = choose_value(info->base,
- screen_info.lfb_base, OVERRIDE_BASE,
- info->flags);
-
-#if defined(CONFIG_PCI)
- /* make sure that the address in the table is actually
- * on a VGA device's PCI BAR */
-
- for_each_pci_dev(dev) {
- int i;
- if ((dev->class >> 8) != PCI_CLASS_DISPLAY_VGA)
- continue;
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
- resource_size_t start, end;
- unsigned long flags;
-
- flags = pci_resource_flags(dev, i);
- if (!(flags & IORESOURCE_MEM))
- continue;
-
- if (flags & IORESOURCE_UNSET)
- continue;
-
- if (pci_resource_len(dev, i) == 0)
- continue;
-
- start = pci_resource_start(dev, i);
- end = pci_resource_end(dev, i);
- if (screen_info.lfb_base >= start &&
- screen_info.lfb_base < end) {
- found_bar = 1;
- break;
- }
- }
- }
- if (!found_bar)
- screen_info.lfb_base = 0;
-#endif
- }
- }
- if (screen_info.lfb_base) {
- screen_info.lfb_linelength = choose_value(info->stride,
- screen_info.lfb_linelength, OVERRIDE_STRIDE,
- info->flags);
- screen_info.lfb_width = choose_value(info->width,
- screen_info.lfb_width, OVERRIDE_WIDTH,
- info->flags);
- screen_info.lfb_height = choose_value(info->height,
- screen_info.lfb_height, OVERRIDE_HEIGHT,
- info->flags);
- if (screen_info.orig_video_isVGA == 0)
- screen_info.orig_video_isVGA = VIDEO_TYPE_EFI;
- } else {
- screen_info.lfb_linelength = 0;
- screen_info.lfb_width = 0;
- screen_info.lfb_height = 0;
- screen_info.orig_video_isVGA = 0;
- return 0;
- }
-
- printk(KERN_INFO "efifb: dmi detected %s - framebuffer at 0x%08x "
- "(%dx%d, stride %d)\n", id->ident,
- screen_info.lfb_base, screen_info.lfb_width,
- screen_info.lfb_height, screen_info.lfb_linelength);
-
- return 1;
-}
-
-#define EFIFB_DMI_SYSTEM_ID(vendor, name, enumid) \
- { \
- efifb_set_system, \
- name, \
- { \
- DMI_MATCH(DMI_BIOS_VENDOR, vendor), \
- DMI_MATCH(DMI_PRODUCT_NAME, name) \
- }, \
- &efifb_dmi_list[enumid] \
- }
-
-static const struct dmi_system_id efifb_dmi_system_table[] __initconst = {
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac4,1", M_I17),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac5,1", M_I20),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac5,1", M_I20),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "iMac6,1", M_I24),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac6,1", M_I24),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac7,1", M_I20_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac8,1", M_I24_8_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac10,1", M_I24_10_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "iMac11,1", M_I27_11_1),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "Macmini1,1", M_MINI),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini3,1", M_MINI_3_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "Macmini4,1", M_MINI_4_1),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook1,1", M_MB),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook2,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook2,1", M_MB),
- /* At least one of these two will be right; maybe both? */
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBook3,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook3,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook4,1", M_MB),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook5,1", M_MB_5_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook6,1", M_MB_6_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBook7,1", M_MB_7_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir1,1", M_MBA),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookAir3,1", M_MBA_3),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro1,1", M_MBP),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,1", M_MBP_2),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro2,2", M_MBP_2_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro2,1", M_MBP_2),
- EFIFB_DMI_SYSTEM_ID("Apple Computer, Inc.", "MacBookPro3,1", M_MBP_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro3,1", M_MBP_SR),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro4,1", M_MBP_4),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,1", M_MBP_5_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,2", M_MBP_5_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro5,3", M_MBP_5_3),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,1", M_MBP_6_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro6,2", M_MBP_6_2),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro7,1", M_MBP_7_1),
- EFIFB_DMI_SYSTEM_ID("Apple Inc.", "MacBookPro8,2", M_MBP_8_2),
- {},
-};
-
-/*
- * Some devices have a portrait LCD but advertise a landscape resolution (and
- * pitch). We simply swap width and height for these devices so that we can
- * correctly deal with some of them coming with multiple resolutions.
- */
-static const struct dmi_system_id efifb_dmi_swap_width_height[] __initconst = {
- {
- /*
- * Lenovo MIIX310-10ICR, only some batches have the troublesome
- * 800x1280 portrait screen. Luckily the portrait version has
- * its own BIOS version, so we match on that.
- */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION, "MIIX 310-10ICR"),
- DMI_EXACT_MATCH(DMI_BIOS_VERSION, "1HCN44WW"),
- },
- },
- {
- /* Lenovo MIIX 320-10ICR with 800x1280 portrait screen */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
- "Lenovo MIIX 320-10ICR"),
- },
- },
- {
- /* Lenovo D330 with 800x1280 or 1200x1920 portrait screen */
- .matches = {
- DMI_EXACT_MATCH(DMI_SYS_VENDOR, "LENOVO"),
- DMI_EXACT_MATCH(DMI_PRODUCT_VERSION,
- "Lenovo ideapad D330-10IGM"),
- },
- },
- {},
-};
-
-__init void sysfb_apply_efi_quirks(void)
-{
- if (screen_info.orig_video_isVGA != VIDEO_TYPE_EFI ||
- !(screen_info.capabilities & VIDEO_CAPABILITY_SKIP_QUIRKS))
- dmi_check_system(efifb_dmi_system_table);
-
- if (screen_info.orig_video_isVGA == VIDEO_TYPE_EFI &&
- dmi_check_system(efifb_dmi_swap_width_height)) {
- u16 temp = screen_info.lfb_width;
-
- screen_info.lfb_width = screen_info.lfb_height;
- screen_info.lfb_height = temp;
- screen_info.lfb_linelength = 4 * screen_info.lfb_width;
- }
-}
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
deleted file mode 100644
index 298fc1edd9c9..000000000000
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Generic System Framebuffers on x86
- * Copyright (c) 2012-2013 David Herrmann <dh.herrmann@gmail.com>
- */
-
-/*
- * simple-framebuffer probing
- * Try to convert "screen_info" into a "simple-framebuffer" compatible mode.
- * If the mode is incompatible, we return "false" and let the caller create
- * legacy nodes instead.
- */
-
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/platform_data/simplefb.h>
-#include <linux/platform_device.h>
-#include <linux/screen_info.h>
-#include <asm/sysfb.h>
-
-static const char simplefb_resname[] = "BOOTFB";
-static const struct simplefb_format formats[] = SIMPLEFB_FORMATS;
-
-/* try parsing x86 screen_info into a simple-framebuffer mode struct */
-__init bool parse_mode(const struct screen_info *si,
- struct simplefb_platform_data *mode)
-{
- const struct simplefb_format *f;
- __u8 type;
- unsigned int i;
-
- type = si->orig_video_isVGA;
- if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI)
- return false;
-
- for (i = 0; i < ARRAY_SIZE(formats); ++i) {
- f = &formats[i];
- if (si->lfb_depth == f->bits_per_pixel &&
- si->red_size == f->red.length &&
- si->red_pos == f->red.offset &&
- si->green_size == f->green.length &&
- si->green_pos == f->green.offset &&
- si->blue_size == f->blue.length &&
- si->blue_pos == f->blue.offset &&
- si->rsvd_size == f->transp.length &&
- si->rsvd_pos == f->transp.offset) {
- mode->format = f->name;
- mode->width = si->lfb_width;
- mode->height = si->lfb_height;
- mode->stride = si->lfb_linelength;
- return true;
- }
- }
-
- return false;
-}
-
-__init int create_simplefb(const struct screen_info *si,
- const struct simplefb_platform_data *mode)
-{
- struct platform_device *pd;
- struct resource res;
- u64 base, size;
- u32 length;
-
- /*
- * If the 64BIT_BASE capability is set, ext_lfb_base will contain the
- * upper half of the base address. Assemble the address, then make sure
- * it is valid and we can actually access it.
- */
- base = si->lfb_base;
- if (si->capabilities & VIDEO_CAPABILITY_64BIT_BASE)
- base |= (u64)si->ext_lfb_base << 32;
- if (!base || (u64)(resource_size_t)base != base) {
- printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n");
- return -EINVAL;
- }
-
- /*
- * Don't use lfb_size as IORESOURCE size, since it may contain the
- * entire VMEM, and thus require huge mappings. Use just the part we
- * need, that is, the part where the framebuffer is located. But verify
- * that it does not exceed the advertised VMEM.
- * Note that in case of VBE, the lfb_size is shifted by 16 bits for
- * historical reasons.
- */
- size = si->lfb_size;
- if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
- size <<= 16;
- length = mode->height * mode->stride;
- if (length > size) {
- printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
- return -EINVAL;
- }
- length = PAGE_ALIGN(length);
-
- /* setup IORESOURCE_MEM as framebuffer memory */
- memset(&res, 0, sizeof(res));
- res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- res.name = simplefb_resname;
- res.start = base;
- res.end = res.start + length - 1;
- if (res.end <= res.start)
- return -EINVAL;
-
- pd = platform_device_register_resndata(NULL, "simple-framebuffer", 0,
- &res, 1, mode, sizeof(*mode));
- return PTR_ERR_OR_ZERO(pd);
-}
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index 95a98413dc32..54a83a744538 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -7,6 +7,8 @@
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
+#include "mmu.h"
+#include "mmu/mmu_internal.h"
static int vcpu_get_timer_advance_ns(void *data, u64 *val)
{
@@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_
&vcpu_tsc_scaling_frac_fops);
}
}
+
+/*
+ * This covers statistics <1024 (11=log(1024)+1), which should be enough to
+ * cover RMAP_RECYCLE_THRESHOLD.
+ */
+#define RMAP_LOG_SIZE 11
+
+static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" };
+
+static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v)
+{
+ struct kvm_rmap_head *rmap;
+ struct kvm *kvm = m->private;
+ struct kvm_memory_slot *slot;
+ struct kvm_memslots *slots;
+ unsigned int lpage_size, index;
+ /* Still small enough to be on the stack */
+ unsigned int *log[KVM_NR_PAGE_SIZES], *cur;
+ int i, j, k, l, ret;
+
+ ret = -ENOMEM;
+ memset(log, 0, sizeof(log));
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL);
+ if (!log[i])
+ goto out;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+ write_lock(&kvm->mmu_lock);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ for (j = 0; j < slots->used_slots; j++) {
+ slot = &slots->memslots[j];
+ for (k = 0; k < KVM_NR_PAGE_SIZES; k++) {
+ rmap = slot->arch.rmap[k];
+ lpage_size = kvm_mmu_slot_lpages(slot, k + 1);
+ cur = log[k];
+ for (l = 0; l < lpage_size; l++) {
+ index = ffs(pte_list_count(&rmap[l]));
+ if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE))
+ index = RMAP_LOG_SIZE - 1;
+ cur[index]++;
+ }
+ }
+ }
+ }
+
+ write_unlock(&kvm->mmu_lock);
+ mutex_unlock(&kvm->slots_lock);
+
+ /* index=0 counts no rmap; index=1 counts 1 rmap */
+ seq_printf(m, "Rmap_Count:\t0\t1\t");
+ for (i = 2; i < RMAP_LOG_SIZE; i++) {
+ j = 1 << (i - 1);
+ k = (1 << i) - 1;
+ seq_printf(m, "%d-%d\t", j, k);
+ }
+ seq_printf(m, "\n");
+
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {
+ seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]);
+ cur = log[i];
+ for (j = 0; j < RMAP_LOG_SIZE; j++)
+ seq_printf(m, "%d\t", cur[j]);
+ seq_printf(m, "\n");
+ }
+
+ ret = 0;
+out:
+ for (i = 0; i < KVM_NR_PAGE_SIZES; i++)
+ kfree(log[i]);
+
+ return ret;
+}
+
+static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ if (!kvm_get_kvm_safe(kvm))
+ return -ENOENT;
+
+ return single_open(file, kvm_mmu_rmaps_stat_show, kvm);
+}
+
+static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+
+ kvm_put_kvm(kvm);
+
+ return single_release(inode, file);
+}
+
+static const struct file_operations mmu_rmaps_stat_fops = {
+ .open = kvm_mmu_rmaps_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = kvm_mmu_rmaps_stat_release,
+};
+
+int kvm_arch_create_vm_debugfs(struct kvm *kvm)
+{
+ debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm,
+ &mmu_rmaps_stat_fops);
+ return 0;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 41d2a53c5dea..232a86a6faaf 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
int vector)
{
+ struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic);
+ struct kvm_hv *hv = to_kvm_hv(vcpu->kvm);
+ int auto_eoi_old, auto_eoi_new;
+
if (vector < HV_SYNIC_FIRST_VALID_VECTOR)
return;
@@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic,
else
__clear_bit(vector, synic->vec_bitmap);
+ auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
if (synic_has_vector_auto_eoi(synic, vector))
__set_bit(vector, synic->auto_eoi_bitmap);
else
__clear_bit(vector, synic->auto_eoi_bitmap);
+
+ auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256);
+
+ if (!!auto_eoi_old == !!auto_eoi_new)
+ return;
+
+ mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+ if (auto_eoi_new)
+ hv->synic_auto_eoi_used++;
+ else
+ hv->synic_auto_eoi_used--;
+
+ __kvm_request_apicv_update(vcpu->kvm,
+ !hv->synic_auto_eoi_used,
+ APICV_INHIBIT_REASON_HYPERV);
+
+ mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
}
static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
@@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
synic = to_hv_synic(vcpu);
- /*
- * Hyper-V SynIC auto EOI SINT's are
- * not compatible with APICV, so request
- * to deactivate APICV permanently.
- */
- kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
synic->control = HV_SYNIC_CONTROL_ENABLE;
@@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
if (!cpu_smt_possible())
ent->eax |= HV_X64_NO_NONARCH_CORESHARING;
+
+ ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED;
/*
* Default number of spinlock retry attempts, matches
* HyperV 2016.
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a6e218c6140d..5a69cce4d72d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
struct kvm_pit *pit = vcpu->kvm->arch.vpit;
struct hrtimer *timer;
- if (!kvm_vcpu_is_bsp(vcpu) || !pit)
+ /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */
+ if (vcpu->vcpu_id || !pit)
return;
timer = &pit->pit_state.timer;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 11e4065e1617..bbd4a5d18b5d 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -35,11 +35,7 @@ struct kvm_vcpu;
#define IOAPIC_INIT 0x5
#define IOAPIC_EXTINT 0x7
-#ifdef CONFIG_X86
#define RTC_GSI 8
-#else
-#define RTC_GSI -1U
-#endif
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ba5a27879f1d..76fb00921203 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
return;
+ WARN_ONCE(!irqchip_in_kernel(kvm),
+ "Dirty APIC map without an in-kernel local APIC");
+
mutex_lock(&kvm->arch.apic_map_lock);
/*
* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map
@@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic)
- value |= MSR_IA32_APICBASE_BSP;
-
vcpu->arch.apic_base = value;
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
@@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
struct kvm_lapic *apic = vcpu->arch.apic;
int i;
+ if (!init_event) {
+ vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+
if (!apic)
return;
@@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
hrtimer_cancel(&apic->lapic_timer.timer);
if (!init_event) {
- kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE);
+ apic->base_address = APIC_DEFAULT_PHYS_BASE;
+
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
}
kvm_apic_set_version(apic->vcpu);
@@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
- if (kvm_vcpu_is_bsp(vcpu))
- kvm_lapic_set_base(vcpu,
- vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
+
vcpu->arch.pv_eoi.msr_val = 0;
apic_update_ppr(apic);
if (vcpu->arch.apicv_active) {
@@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
lapic_timer_advance_dynamic = false;
}
- /*
- * APIC is created enabled. This will prevent kvm_lapic_set_base from
- * thinking that APIC state has changed.
- */
- vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 83e6c6965f1e..e9688a9f7b57 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
}
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+ /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */
+ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+ (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
+static inline unsigned long
+__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages,
+ int level)
+{
+ return gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+}
+
+static inline unsigned long
+kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level)
+{
+ return __kvm_mmu_slot_lpages(slot, slot->npages, level);
+}
+
+static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count)
+{
+ atomic64_add(count, &kvm->stat.pages[level - 1]);
+}
#endif
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 47b765270239..2d7e61122af8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
bool tdp_enabled = false;
static int max_huge_page_level __read_mostly;
+static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
enum {
@@ -137,12 +138,22 @@ module_param(dbg, bool, 0644);
#include <trace/events/kvm.h>
-/* make pte_list_desc fit well in cache line */
-#define PTE_LIST_EXT 3
+/* make pte_list_desc fit well in cache lines */
+#define PTE_LIST_EXT 14
+/*
+ * Slight optimization of cacheline layout, by putting `more' and `spte_count'
+ * at the start; then accessing it will only use one single cacheline for
+ * either full (entries==PTE_LIST_EXT) case or entries<=6.
+ */
struct pte_list_desc {
- u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
+ /*
+ * Stores number of entries stored in the pte_list_desc. No need to be
+ * u64 but just for easier alignment. When PTE_LIST_EXT, means full.
+ */
+ u64 spte_count;
+ u64 *sptes[PTE_LIST_EXT];
};
struct kvm_shadow_walk_iterator {
@@ -193,7 +204,7 @@ struct kvm_mmu_role_regs {
* the single source of truth for the MMU's state.
*/
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
-static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
{ \
return !!(regs->reg & flag); \
}
@@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
* and the vCPU may be incorrect/irrelevant.
*/
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
-static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \
+static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
{ \
return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
}
@@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception)
{
- /* Check if guest physical address doesn't exceed guest maximum */
- if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) {
- exception->error_code |= PFERR_RSVD_MASK;
- return UNMAPPED_GVA;
- }
-
return gpa;
}
@@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* Rules for using mmu_spte_clear_track_bits:
* It sets the sptep from present to nonpresent, and track the
* state bits, it is used to clear the last level sptep.
- * Returns non-zero if the PTE was previously valid.
+ * Returns the old PTE.
*/
-static int mmu_spte_clear_track_bits(u64 *sptep)
+static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
{
kvm_pfn_t pfn;
u64 old_spte = *sptep;
+ int level = sptep_to_sp(sptep)->role.level;
if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
@@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
old_spte = __update_clear_spte_slow(sptep, 0ull);
if (!is_shadow_present_pte(old_spte))
- return 0;
+ return old_spte;
+
+ kvm_update_page_stats(kvm, level, -1);
pfn = spte_to_pfn(old_spte);
@@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
- return 1;
+ return old_spte;
}
/*
@@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep)
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- /*
- * Prevent page table teardown by making any free-er wait during
- * kvm_flush_remote_tlbs() IPI to all active vcpus.
- */
- local_irq_disable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_begin();
+ } else {
+ /*
+ * Prevent page table teardown by making any free-er wait during
+ * kvm_flush_remote_tlbs() IPI to all active vcpus.
+ */
+ local_irq_disable();
- /*
- * Make sure a following spte read is not reordered ahead of the write
- * to vcpu->mode.
- */
- smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ /*
+ * Make sure a following spte read is not reordered ahead of the write
+ * to vcpu->mode.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+ }
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- /*
- * Make sure the write to vcpu->mode is not reordered in front of
- * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
- * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
- */
- smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
- local_irq_enable();
+ if (is_tdp_mmu(vcpu->arch.mmu)) {
+ kvm_tdp_mmu_walk_lockless_end();
+ } else {
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of
+ * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
+ * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+ local_irq_enable();
+ }
}
static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
@@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
-static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
gfn_t gfn, int count)
{
struct kvm_lpage_info *linfo;
@@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
}
}
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, 1);
}
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
{
update_gfn_disallow_lpage_count(slot, gfn, -1);
}
@@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
struct kvm_rmap_head *rmap_head)
{
struct pte_list_desc *desc;
- int i, count = 0;
+ int count = 0;
if (!rmap_head->val) {
rmap_printk("%p %llx 0->1\n", spte, *spte);
@@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
desc = mmu_alloc_pte_list_desc(vcpu);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
+ desc->spte_count = 2;
rmap_head->val = (unsigned long)desc | 1;
++count;
} else {
rmap_printk("%p %llx many->many\n", spte, *spte);
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
- while (desc->sptes[PTE_LIST_EXT-1]) {
+ while (desc->spte_count == PTE_LIST_EXT) {
count += PTE_LIST_EXT;
-
if (!desc->more) {
desc->more = mmu_alloc_pte_list_desc(vcpu);
desc = desc->more;
+ desc->spte_count = 0;
break;
}
desc = desc->more;
}
- for (i = 0; desc->sptes[i]; ++i)
- ++count;
- desc->sptes[i] = spte;
+ count += desc->spte_count;
+ desc->sptes[desc->spte_count++] = spte;
}
return count;
}
@@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i,
struct pte_list_desc *prev_desc)
{
- int j;
+ int j = desc->spte_count - 1;
- for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
- ;
desc->sptes[i] = desc->sptes[j];
desc->sptes[j] = NULL;
- if (j != 0)
+ desc->spte_count--;
+ if (desc->spte_count)
return;
if (!prev_desc && !desc->more)
rmap_head->val = 0;
@@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
prev_desc = NULL;
while (desc) {
- for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+ for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
pte_list_desc_remove_entry(rmap_head,
desc, i, prev_desc);
@@ -984,30 +999,68 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
}
}
-static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
+static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ u64 *sptep)
{
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
__pte_list_remove(sptep, rmap_head);
}
-static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
+/* Return true if rmap existed, false otherwise */
+static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
{
- unsigned long idx;
+ struct pte_list_desc *desc, *next;
+ int i;
- idx = gfn_to_index(gfn, slot->base_gfn, level);
- return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
+ if (!rmap_head->val)
+ return false;
+
+ if (!(rmap_head->val & 1)) {
+ mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
+ goto out;
+ }
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ for (; desc; desc = next) {
+ for (i = 0; i < desc->spte_count; i++)
+ mmu_spte_clear_track_bits(kvm, desc->sptes[i]);
+ next = desc->more;
+ mmu_free_pte_list_desc(desc);
+ }
+out:
+ /* rmap_head is meaningless now, remember to reset it */
+ rmap_head->val = 0;
+ return true;
}
-static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
- struct kvm_mmu_page *sp)
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
{
- struct kvm_memslots *slots;
- struct kvm_memory_slot *slot;
+ struct pte_list_desc *desc;
+ unsigned int count = 0;
- slots = kvm_memslots_for_spte_role(kvm, sp->role);
- slot = __gfn_to_memslot(slots, gfn);
- return __gfn_to_rmap(gfn, sp->role.level, slot);
+ if (!rmap_head->val)
+ return 0;
+ else if (!(rmap_head->val & 1))
+ return 1;
+
+ desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+
+ while (desc) {
+ count += desc->spte_count;
+ desc = desc->more;
+ }
+
+ return count;
+}
+
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+ const struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
}
static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu)
static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
return pte_list_add(vcpu, spte, rmap_head);
}
+
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
struct kvm_mmu_page *sp;
gfn_t gfn;
struct kvm_rmap_head *rmap_head;
sp = sptep_to_sp(spte);
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmap_head = gfn_to_rmap(kvm, gfn, sp);
+
+ /*
+ * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the
+ * context of a vCPU so have to determine which memslots to use based
+ * on context information in sp->role.
+ */
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+
+ slot = __gfn_to_memslot(slots, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
+
__pte_list_remove(spte, rmap_head);
}
@@ -1119,7 +1187,9 @@ out:
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (mmu_spte_clear_track_bits(sptep))
+ u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
+
+ if (is_shadow_present_pte(old_spte))
rmap_remove(kvm, sptep);
}
@@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
if (is_large_pte(*sptep)) {
WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K);
drop_spte(kvm, sptep);
- --kvm->stat.lpages;
return true;
}
@@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep)
* Returns true iff any D or W bits were cleared.
*/
static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
return;
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
__rmap_write_protect(kvm, rmap_head, false);
/* clear the first set bit */
@@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
return;
while (mask) {
- rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
- PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
+ PG_LEVEL_4K, slot);
__rmap_clear_dirty(kvm, rmap_head, slot);
/* clear the first set bit */
@@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
if (kvm_memslots_have_rmaps(kvm)) {
for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
- rmap_head = __gfn_to_rmap(gfn, i, slot);
+ rmap_head = gfn_to_rmap(gfn, i, slot);
write_protected |= __rmap_write_protect(kvm, rmap_head, true);
}
}
@@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
}
static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
- u64 *sptep;
- struct rmap_iterator iter;
- bool flush = false;
-
- while ((sptep = rmap_get_first(rmap_head, &iter))) {
- rmap_printk("spte %p %llx.\n", sptep, *sptep);
-
- pte_list_remove(rmap_head, sptep);
- flush = true;
- }
-
- return flush;
+ return pte_list_destroy(kvm, rmap_head);
}
static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -1421,13 +1479,13 @@ restart:
need_flush = 1;
if (pte_write(pte)) {
- pte_list_remove(rmap_head, sptep);
+ pte_list_remove(kvm, rmap_head, sptep);
goto restart;
} else {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(
*sptep, new_pfn);
- mmu_spte_clear_track_bits(sptep);
+ mmu_spte_clear_track_bits(kvm, sptep);
mmu_spte_set(sptep, new_spte);
}
}
@@ -1442,7 +1500,7 @@ restart:
struct slot_rmap_walk_iterator {
/* input fields. */
- struct kvm_memory_slot *slot;
+ const struct kvm_memory_slot *slot;
gfn_t start_gfn;
gfn_t end_gfn;
int start_level;
@@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
{
iterator->level = level;
iterator->gfn = iterator->start_gfn;
- iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
- iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
- iterator->slot);
+ iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot);
}
static void
slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
- struct kvm_memory_slot *slot, int start_level,
+ const struct kvm_memory_slot *slot, int start_level,
int end_level, gfn_t start_gfn, gfn_t end_gfn)
{
iterator->slot = slot;
@@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
{
+ struct kvm_memory_slot *slot;
struct kvm_rmap_head *rmap_head;
struct kvm_mmu_page *sp;
sp = sptep_to_sp(spte);
-
- rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0));
kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
@@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level)) {
drop_spte(kvm, spte);
- if (is_large_pte(pte))
- --kvm->stat.lpages;
} else {
child = to_shadow_page(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
@@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
trace_kvm_mmu_set_spte(level, gfn, sptep);
- if (!was_rmapped && is_large_pte(*sptep))
- ++vcpu->kvm->stat.lpages;
- if (is_shadow_present_pte(*sptep)) {
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
- }
+ if (!was_rmapped) {
+ kvm_update_page_stats(vcpu->kvm, level, 1);
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
}
return ret;
@@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
kvm_pfn_t pfn, int max_level)
{
struct kvm_lpage_info *linfo;
+ int host_level;
max_level = min(max_level, max_huge_page_level);
for ( ; max_level > PG_LEVEL_4K; max_level--) {
@@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
if (max_level == PG_LEVEL_4K)
return PG_LEVEL_4K;
- return host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
+ return min(host_level, max_level);
}
int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
if (!slot)
return PG_LEVEL_4K;
- level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
- if (level == PG_LEVEL_4K)
- return level;
-
- *req_level = level = min(level, max_level);
-
/*
* Enforce the iTLB multihit workaround after capturing the requested
* level, which will be used to do precise, accurate accounting.
*/
- if (huge_page_disallowed)
+ *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level);
+ if (level == PG_LEVEL_4K || huge_page_disallowed)
return PG_LEVEL_4K;
/*
@@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
break;
drop_large_spte(vcpu, it.sptep);
- if (!is_shadow_present_pte(*it.sptep)) {
- sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
- it.level - 1, true, ACC_ALL);
-
- link_shadow_page(vcpu, it.sptep, sp);
- if (is_tdp && huge_page_disallowed &&
- req_level >= it.level)
- account_huge_nx_page(vcpu->kvm, sp);
- }
+ if (is_shadow_present_pte(*it.sptep))
+ continue;
+
+ sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
+ it.level - 1, true, ACC_ALL);
+
+ link_shadow_page(vcpu, it.sptep, sp);
+ if (is_tdp && huge_page_disallowed &&
+ req_level >= it.level)
+ account_huge_nx_page(vcpu->kvm, sp);
}
ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
@@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
}
/*
- * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between walk_shadow_page_lockless_{begin,end}.
+ * - The returned sptep must not be used after walk_shadow_page_lockless_end.
*/
-static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- u32 error_code)
+static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
{
struct kvm_shadow_walk_iterator iterator;
+ u64 old_spte;
+ u64 *sptep = NULL;
+
+ for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
+ sptep = iterator.sptep;
+ *spte = old_spte;
+
+ if (!is_shadow_present_pte(old_spte))
+ break;
+ }
+
+ return sptep;
+}
+
+/*
+ * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
+ */
+static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code)
+{
struct kvm_mmu_page *sp;
int ret = RET_PF_INVALID;
u64 spte = 0ull;
+ u64 *sptep = NULL;
uint retry_count = 0;
if (!page_fault_can_be_fast(error_code))
@@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
- if (!is_shadow_present_pte(spte))
- break;
+ if (is_tdp_mmu(vcpu->arch.mmu))
+ sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte);
+ else
+ sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte);
if (!is_shadow_present_pte(spte))
break;
- sp = sptep_to_sp(iterator.sptep);
+ sp = sptep_to_sp(sptep);
if (!is_last_spte(spte, sp->role.level))
break;
@@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* since the gfn is not stable for indirect shadow page. See
* Documentation/virt/kvm/locking.rst to get more detail.
*/
- if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte,
- new_spte)) {
+ if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) {
ret = RET_PF_FIXED;
break;
}
@@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
} while (true);
- trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
- spte, ret);
+ trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
@@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK | shadow_me_mask;
- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
+ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->pml4_root)) {
r = -EIO;
goto out_unlock;
}
-
mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
+
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+ if (WARN_ON_ONCE(!mmu->pml5_root)) {
+ r = -EIO;
+ goto out_unlock;
+ }
+ mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask;
+ }
}
for (i = 0; i < 4; ++i) {
@@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | pm_mask;
}
- if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+ mmu->root_hpa = __pa(mmu->pml5_root);
+ else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
mmu->root_hpa = __pa(mmu->pml4_root);
else
mmu->root_hpa = __pa(mmu->pae_root);
@@ -3498,7 +3582,10 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u64 *pml4_root, *pae_root;
+ bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+ u64 *pml5_root = NULL;
+ u64 *pml4_root = NULL;
+ u64 *pae_root;
/*
* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
@@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
return 0;
/*
- * This mess only works with 4-level paging and needs to be updated to
- * work with 5-level paging.
+ * NPT, the only paging mode that uses this horror, uses a fixed number
+ * of levels for the shadow page tables, e.g. all MMUs are 4-level or
+ * all MMus are 5-level. Thus, this can safely require that pml5_root
+ * is allocated if the other roots are valid and pml5 is needed, as any
+ * prior MMU would also have required pml5.
*/
- if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL))
- return -EIO;
-
- if (mmu->pae_root && mmu->pml4_root)
+ if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root))
return 0;
/*
* The special roots should always be allocated in concert. Yell and
* bail if KVM ends up in a state where only one of the roots is valid.
*/
- if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root))
+ if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root ||
+ (need_pml5 && mmu->pml5_root)))
return -EIO;
/*
@@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
if (!pae_root)
return -ENOMEM;
+#ifdef CONFIG_X86_64
pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
- if (!pml4_root) {
- free_page((unsigned long)pae_root);
- return -ENOMEM;
+ if (!pml4_root)
+ goto err_pml4;
+
+ if (need_pml5) {
+ pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+ if (!pml5_root)
+ goto err_pml5;
}
+#endif
mmu->pae_root = pae_root;
mmu->pml4_root = pml4_root;
+ mmu->pml5_root = pml5_root;
return 0;
+
+#ifdef CONFIG_X86_64
+err_pml5:
+ free_page((unsigned long)pml4_root);
+err_pml4:
+ free_page((unsigned long)pae_root);
+ return -ENOMEM;
+#endif
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between walk_shadow_page_lockless_{begin,end}.
*/
static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level)
{
@@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
int leaf = -1;
u64 spte;
- walk_shadow_page_lockless_begin(vcpu);
-
for (shadow_walk_init(&iterator, vcpu, addr),
*root_level = iterator.level;
shadow_walk_okay(&iterator);
@@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level
break;
}
- walk_shadow_page_lockless_end(vcpu);
-
return leaf;
}
@@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
int root, leaf, level;
bool reserved = false;
+ walk_shadow_page_lockless_begin(vcpu);
+
if (is_tdp_mmu(vcpu->arch.mmu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
+ walk_shadow_page_lockless_end(vcpu);
+
if (unlikely(leaf < 0)) {
*sptep = 0ull;
return reserved;
@@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
- bool write, bool *writable)
+ bool write, bool *writable, int *r)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
* be zapped before KVM inserts a new MMIO SPTE for the gfn.
*/
if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
- return true;
-
- /* Don't expose private memslots to L2. */
- if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) {
- *pfn = KVM_PFN_NOSLOT;
- *writable = false;
- return false;
+ goto out_retry;
+
+ if (!kvm_is_visible_memslot(slot)) {
+ /* Don't expose private memslots to L2. */
+ if (is_guest_mode(vcpu)) {
+ *pfn = KVM_PFN_NOSLOT;
+ *writable = false;
+ return false;
+ }
+ /*
+ * If the APIC access page exists but is disabled, go directly
+ * to emulation without caching the MMIO access or creating a
+ * MMIO SPTE. That way the cache doesn't need to be purged
+ * when the AVIC is re-enabled.
+ */
+ if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
+ !kvm_apicv_activated(vcpu->kvm)) {
+ *r = RET_PF_EMULATE;
+ return true;
+ }
}
async = false;
@@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- return true;
+ goto out_retry;
} else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
- return true;
+ goto out_retry;
}
*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
write, writable, hva);
- return false;
+
+out_retry:
+ *r = RET_PF_RETRY;
+ return true;
}
static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
@@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
- if (!is_tdp_mmu_fault) {
- r = fast_page_fault(vcpu, gpa, error_code);
- if (r != RET_PF_INVALID)
- return r;
- }
+ r = fast_page_fault(vcpu, gpa, error_code);
+ if (r != RET_PF_INVALID)
+ return r;
r = mmu_topup_memory_caches(vcpu, false);
if (r)
@@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva,
- write, &map_writable))
- return RET_PF_RETRY;
+ if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva,
+ write, &map_writable, &r))
+ return r;
if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
return r;
@@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
{
+ /* tdp_root_level is architecture forced level, use it if nonzero */
+ if (tdp_root_level)
+ return tdp_root_level;
+
/* Use 5-level TDP if and only if it's useful/necessary. */
if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48)
return 4;
@@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
if (r == RET_PF_INVALID) {
r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code), false);
- if (WARN_ON_ONCE(r == RET_PF_INVALID))
+ if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
return -EIO;
}
@@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
*/
}
-void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
- int tdp_huge_page_level)
+void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
+ int tdp_max_root_level, int tdp_huge_page_level)
{
tdp_enabled = enable_tdp;
+ tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
/*
@@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level,
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield,
bool flush)
@@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
bool flush_on_yield)
{
@@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
}
static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot,
slot_level_handler fn, bool flush_on_yield)
{
return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K,
@@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
set_memory_encrypted((unsigned long)mmu->pae_root, 1);
free_page((unsigned long)mmu->pae_root);
free_page((unsigned long)mmu->pml4_root);
+ free_page((unsigned long)mmu->pml5_root);
}
static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
@@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_mmu_uninit_tdp_mmu(kvm);
}
+/*
+ * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
+ * (not including it)
+ */
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
struct kvm_memslots *slots;
@@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
int i;
bool flush = false;
+ write_lock(&kvm->mmu_lock);
+
+ kvm_inc_notifier_count(kvm, gfn_start, gfn_end);
+
if (kvm_memslots_have_rmaps(kvm)) {
- write_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
kvm_for_each_memslot(memslot, slots) {
@@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- flush = slot_handle_level_range(kvm, memslot,
+ flush = slot_handle_level_range(kvm,
+ (const struct kvm_memory_slot *) memslot,
kvm_zap_rmapp, PG_LEVEL_4K,
KVM_MAX_HUGEPAGE_LEVEL, start,
end - 1, true, flush);
}
}
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
- write_unlock(&kvm->mmu_lock);
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
+ gfn_end - gfn_start);
}
if (is_tdp_mmu_enabled(kvm)) {
- flush = false;
-
- read_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
- gfn_end, flush, true);
+ gfn_end, flush);
if (flush)
kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end);
-
- read_unlock(&kvm->mmu_lock);
+ gfn_end - gfn_start);
}
+
+ if (flush)
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end);
+
+ kvm_dec_notifier_count(kvm, gfn_start, gfn_end);
+
+ write_unlock(&kvm->mmu_lock);
}
static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
return __rmap_write_protect(kvm, rmap_head, false);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot,
+ const struct kvm_memory_slot *memslot,
int start_level)
{
bool flush = false;
@@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot)
+ const struct kvm_memory_slot *slot)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5699,7 +5835,7 @@ restart:
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
pfn, PG_LEVEL_NUM)) {
- pte_list_remove(rmap_head, sptep);
+ pte_list_remove(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
@@ -5715,10 +5851,8 @@ restart:
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- const struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *slot)
{
- /* FIXME: const-ify all uses of struct kvm_memory_slot. */
- struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot;
bool flush = false;
if (kvm_memslots_have_rmaps(kvm)) {
@@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm,
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
bool flush = false;
diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c
index cedc17b2f60e..9e7dcf999f08 100644
--- a/arch/x86/kvm/mmu/mmu_audit.c
+++ b/arch/x86/kvm/mmu/mmu_audit.c
@@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
+ rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot);
if (!rmap_head->val) {
if (!__ratelimit(&ratelimit_state))
return;
@@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
slots = kvm_memslots_for_spte_role(kvm, sp->role);
slot = __gfn_to_memslot(slots, sp->gfn);
- rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
+ rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot);
for_each_rmap_spte(rmap_head, &iter, sptep) {
if (is_writable_pte(*sptep))
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 35567293c1fd..bf2bdbf333c2 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -31,13 +31,16 @@ extern bool dbg;
#define IS_VALID_PAE_ROOT(x) (!!(x))
struct kvm_mmu_page {
+ /*
+ * Note, "link" through "spt" fit in a single 64 byte cache line on
+ * 64-bit kernels, keep it that way unless there's a reason not to.
+ */
struct list_head link;
struct hlist_node hash_link;
- struct list_head lpage_disallowed_link;
+ bool tdp_mmu_page;
bool unsync;
u8 mmu_valid_gen;
- bool mmio_cached;
bool lpage_disallowed; /* Can't be replaced by an equiv large page */
/*
@@ -59,6 +62,7 @@ struct kvm_mmu_page {
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+ struct list_head lpage_disallowed_link;
#ifdef CONFIG_X86_32
/*
* Used out of the mmu-lock to avoid reading spte values while an
@@ -71,8 +75,6 @@ struct kvm_mmu_page {
atomic_t write_flooding_count;
#ifdef CONFIG_X86_64
- bool tdp_mmu_page;
-
/* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
#endif
@@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void)
int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync);
-void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
+unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
/*
* Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault().
@@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
+ *
+ * Any names added to this enum should be exported to userspace for use in
+ * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h
*/
enum {
RET_PF_RETRY = 0,
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index efbad33a0645..2924a4081a19 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -54,6 +54,12 @@
{ PFERR_RSVD_MASK, "RSVD" }, \
{ PFERR_FETCH_MASK, "F" }
+TRACE_DEFINE_ENUM(RET_PF_RETRY);
+TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_INVALID);
+TRACE_DEFINE_ENUM(RET_PF_FIXED);
+TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
+
/*
* A pagetable walk has started
*/
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 91a9f7e0fd91..269f11f92fd0 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -16,6 +16,7 @@
#include <asm/kvm_page_track.h>
+#include "mmu.h"
#include "mmu_internal.h"
void kvm_page_track_free_memslot(struct kvm_memory_slot *slot)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ee044d357b5f..7d03e9b7ccfa 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
- write_fault, &map_writable))
- return RET_PF_RETRY;
+ if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva,
+ write_fault, &map_writable, &r))
+ return r;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d80cb122b5f3..64ccfc1fa553 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -10,7 +10,7 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = false;
+static bool __read_mostly tdp_mmu_enabled = true;
module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
/* Initializes the TDP MMU for the VM, if enabled. */
@@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
*
* @kvm: kvm instance
* @sp: the new page
- * @shared: This operation may not be running under the exclusive use of
- * the MMU lock and the operation must synchronize with other
- * threads that might be adding or removing pages.
* @account_nx: This page replaces a NX large page and should be marked for
* eventual reclaim.
*/
static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- bool shared, bool account_nx)
+ bool account_nx)
{
- if (shared)
- spin_lock(&kvm->arch.tdp_mmu_pages_lock);
- else
- lockdep_assert_held_write(&kvm->mmu_lock);
-
+ spin_lock(&kvm->arch.tdp_mmu_pages_lock);
list_add(&sp->link, &kvm->arch.tdp_mmu_pages);
if (account_nx)
account_huge_nx_page(kvm, sp);
-
- if (shared)
- spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
}
/**
@@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);
- if (is_large_pte(old_spte) != is_large_pte(new_spte)) {
- if (is_large_pte(old_spte))
- atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages);
- else
- atomic64_add(1, (atomic64_t*)&kvm->stat.lpages);
- }
-
/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
return;
}
+ if (is_leaf != was_leaf)
+ kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1);
if (was_leaf && is_dirty_spte(old_spte) &&
(!is_present || !is_dirty_spte(new_spte) || pfn_changed))
@@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
if (is_removed_spte(iter->old_spte))
return false;
+ /*
+ * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and
+ * does not hold the mmu_lock.
+ */
if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte,
new_spte) != iter->old_spte)
return false;
@@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm,
return true;
}
-static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm,
- struct tdp_iter *iter,
- u64 new_spte)
+/*
+ * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a
+ * TDP page fault.
+ *
+ * @vcpu: The vcpu instance that took the TDP page fault.
+ * @iter: a tdp_iter instance currently on the SPTE that should be set
+ * @new_spte: The value the SPTE should be set to
+ *
+ * Returns: true if the SPTE was set, false if it was not. If false is returned,
+ * this function will have no side-effects.
+ */
+static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu,
+ struct tdp_iter *iter,
+ u64 new_spte)
{
+ struct kvm *kvm = vcpu->kvm;
+
if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte))
return false;
- handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn,
- iter->old_spte, new_spte, iter->level);
+ /*
+ * Use kvm_vcpu_gfn_to_memslot() instead of going through
+ * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot.
+ */
+ if (is_writable_pte(new_spte)) {
+ struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn);
+
+ if (slot && kvm_slot_dirty_track_enabled(slot)) {
+ /* Enforced by kvm_mmu_hugepage_adjust. */
+ WARN_ON_ONCE(iter->level > PG_LEVEL_4K);
+ mark_page_dirty_in_slot(kvm, slot, iter->gfn);
+ }
+ }
+
return true;
}
@@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* immediately installing a present entry in its place
* before the TLBs are flushed.
*/
- if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE))
+ if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE))
return false;
kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
@@ -789,21 +804,15 @@ retry:
* non-root pages mapping GFNs strictly within that range. Returns true if
* SPTEs have been cleared and a TLB flush is needed before releasing the
* MMU lock.
- *
- * If shared is true, this thread holds the MMU lock in read mode and must
- * account for the possibility that other threads are modifying the paging
- * structures concurrently. If shared is false, this thread should hold the
- * MMU in write mode.
*/
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush,
- bool shared)
+ gfn_t end, bool can_yield, bool flush)
{
struct kvm_mmu_page *root;
- for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared)
+ for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false)
flush = zap_gfn_range(kvm, root, start, end, can_yield, flush,
- shared);
+ false);
return flush;
}
@@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm)
int i;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
- flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull,
- flush, false);
+ flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush);
if (flush)
kvm_flush_remote_tlbs(kvm);
@@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write,
if (new_spte == iter->old_spte)
ret = RET_PF_SPURIOUS;
- else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte))
+ else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte))
return RET_PF_RETRY;
/*
@@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
new_spte = make_nonleaf_spte(child_pt,
!shadow_accessed_mask);
- if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter,
- new_spte)) {
- tdp_mmu_link_page(vcpu->kvm, sp, true,
+ if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) {
+ tdp_mmu_link_page(vcpu->kvm, sp,
huge_page_disallowed &&
req_level >= iter.level);
@@ -1255,8 +1262,8 @@ retry:
* only affect leaf SPTEs down to min_level.
* Returns true if an SPTE has been changed and the TLBs need to be flushed.
*/
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level)
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level)
{
struct kvm_mmu_page *root;
bool spte_set = false;
@@ -1326,7 +1333,8 @@ retry:
* each SPTE. Returns true if an SPTE has been changed and the TLBs need to
* be flushed.
*/
-bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
bool spte_set = false;
@@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
/*
* Return the level of the lowest level SPTE added to sptes.
* That SPTE may be non-present.
+ *
+ * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
*/
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level)
@@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
*root_level = vcpu->arch.mmu->shadow_root_level;
- rcu_read_lock();
-
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
sptes[leaf] = iter.old_spte;
}
- rcu_read_unlock();
-
return leaf;
}
+
+/*
+ * Returns the last level spte pointer of the shadow page walk for the given
+ * gpa, and sets *spte to the spte value. This spte may be non-preset. If no
+ * walk could be performed, returns NULL and *spte does not contain valid data.
+ *
+ * Contract:
+ * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
+ * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
+ *
+ * WARNING: This function is only intended to be called during fast_page_fault.
+ */
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte)
+{
+ struct tdp_iter iter;
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+ tdp_ptep_t sptep = NULL;
+
+ tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
+ *spte = iter.old_spte;
+ sptep = iter.sptep;
+ }
+
+ /*
+ * Perform the rcu_dereference to get the raw spte pointer value since
+ * we are passing it up to fast_page_fault, which is shared with the
+ * legacy MMU and thus does not retain the TDP MMU-specific __rcu
+ * annotation.
+ *
+ * This is safe since fast_page_fault obeys the contracts of this
+ * function as well as all TDP MMU contracts around modifying SPTEs
+ * outside of mmu_lock.
+ */
+ return rcu_dereference(sptep);
+}
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 1cae4485b3bc..358f447d4012 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
bool shared);
bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start,
- gfn_t end, bool can_yield, bool flush,
- bool shared);
+ gfn_t end, bool can_yield, bool flush);
static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id,
- gfn_t start, gfn_t end, bool flush,
- bool shared)
+ gfn_t start, gfn_t end, bool flush)
{
- return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush,
- shared);
+ return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush);
}
static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
@@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
*/
lockdep_assert_held_write(&kvm->mmu_lock);
return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp),
- sp->gfn, end, false, false, false);
+ sp->gfn, end, false, false);
}
void kvm_tdp_mmu_zap_all(struct kvm *kvm);
@@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
- int min_level);
+bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot, int min_level);
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
- struct kvm_memory_slot *slot);
+ const struct kvm_memory_slot *slot);
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
@@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,
int min_level);
+static inline void kvm_tdp_mmu_walk_lockless_begin(void)
+{
+ rcu_read_lock();
+}
+
+static inline void kvm_tdp_mmu_walk_lockless_end(void)
+{
+ rcu_read_unlock();
+}
+
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
int *root_level);
+u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
+ u64 *spte);
#ifdef CONFIG_X86_64
bool kvm_mmu_init_tdp_mmu(struct kvm *kvm);
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 827886c12c16..0772bad9165c 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
pmc->perf_event = event;
pmc_to_pmu(pmc)->event_count++;
clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi);
+ pmc->is_paused = false;
}
static void pmc_pause_counter(struct kvm_pmc *pmc)
{
u64 counter = pmc->counter;
- if (!pmc->perf_event)
+ if (!pmc->perf_event || pmc->is_paused)
return;
/* update counter, reset event value to avoid redundant accumulation */
counter += perf_event_pause(pmc->perf_event, true);
pmc->counter = counter & pmc_bitmask(pmc);
+ pmc->is_paused = true;
}
static bool pmc_resume_counter(struct kvm_pmc *pmc)
@@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
/* reuse perf_event to serve as pmc_reprogram_counter() does*/
perf_event_enable(pmc->perf_event);
+ pmc->is_paused = false;
clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi);
return true;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 67e753edfa22..0e4f2b1fa9fb 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
u64 counter, enabled, running;
counter = pmc->counter;
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
counter += perf_event_read_value(pmc->perf_event,
&enabled, &running);
/* FIXME: Scaling needed? */
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a8ad78a2faa1..8052d92069e0 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
+ vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK;
+
if (kvm_apicv_activated(svm->vcpu.kvm))
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
else
@@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_update_access_page(struct kvm *kvm, bool activate)
+static int avic_alloc_access_page(struct kvm *kvm)
{
void __user *ret;
int r = 0;
mutex_lock(&kvm->slots_lock);
- /*
- * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
- * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
- * memory region. So, we need to ensure that kvm->mm == current->mm.
- */
- if ((kvm->arch.apic_access_memslot_enabled == activate) ||
- (kvm->mm != current->mm))
+
+ if (kvm->arch.apic_access_memslot_enabled)
goto out;
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
- activate ? PAGE_SIZE : 0);
+ PAGE_SIZE);
if (IS_ERR(ret)) {
r = PTR_ERR(ret);
goto out;
}
- kvm->arch.apic_access_memslot_enabled = activate;
+ kvm->arch.apic_access_memslot_enabled = true;
out:
mutex_unlock(&kvm->slots_lock);
return r;
@@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_update_access_page(vcpu->kvm, true);
+ ret = avic_alloc_access_page(vcpu->kvm);
if (ret)
return ret;
}
@@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
-{
- if (!enable_apicv || !lapic_in_kernel(vcpu))
- return;
-
- srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- kvm_request_apicv_update(vcpu->kvm, activate,
- APICV_INHIBIT_REASON_IRQWIN);
- vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-}
-
void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
return;
@@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+ if (activated)
+ avic_vcpu_load(vcpu, vcpu->cpu);
+ else
+ avic_vcpu_put(vcpu);
+
svm_set_pi_irte_mode(vcpu, activated);
}
@@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit)
return supported & BIT(bit);
}
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
-{
- avic_update_access_page(kvm, activate);
-}
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
@@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
/*
* Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255.
@@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
- if (!kvm_vcpu_apicv_active(vcpu))
- return;
-
entry = READ_ONCE(*(svm->avic_physical_id_cache));
if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
@@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
struct vcpu_svm *svm = to_svm(vcpu);
svm->avic_is_running = is_run;
+
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
if (is_run)
avic_vcpu_load(vcpu, vcpu->cpu);
else
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e5515477c30a..2545d0c61985 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
goto out;
}
-
- /* Clear internal status */
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
-
/*
* Since vmcb01 is not in use, we can use it to store some of the L1
* state.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 7fbce342eec4..75e0b21ad07c 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -28,8 +28,6 @@
#include "cpuid.h"
#include "trace.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
#ifndef CONFIG_KVM_AMD_SEV
/*
* When this config is not defined, SEV feature is not supported and APIs in
@@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm)
save->xcr0 = svm->vcpu.arch.xcr0;
save->pkru = svm->vcpu.arch.pkru;
save->xss = svm->vcpu.arch.ia32_xss;
+ save->dr6 = svm->vcpu.arch.dr6;
/*
* SEV-ES will use a VMSA that is pointed to by the VMCB, not
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 69639f9624f5..05e8d4d27969 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -46,8 +46,6 @@
#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
-
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
@@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr)
static int get_max_npt_level(void)
{
#ifdef CONFIG_X86_64
- return PT64_ROOT_4LEVEL;
+ return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
@@ -462,11 +460,6 @@ static int has_svm(void)
return 0;
}
- if (pgtable_l5_enabled()) {
- pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n");
- return 0;
- }
-
return 1;
}
@@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void)
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
- kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G);
+ /* Force VM NPT level equal to the host's max NPT level */
+ kvm_configure_mmu(npt_enabled, get_max_npt_level(),
+ get_max_npt_level(), PG_LEVEL_1G);
pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Note, SEV setup consumes npt_enabled. */
@@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- vcpu->arch.hflags = 0;
-
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
svm_set_intercept(svm, INTERCEPT_CR4_READ);
@@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;
+ save->gdtr.base = 0;
save->gdtr.limit = 0xffff;
+ save->idtr.base = 0;
save->idtr.limit = 0xffff;
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_cr4(vcpu, 0);
- svm_set_efer(vcpu, 0);
- save->dr6 = 0xffff0ff0;
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- save->rip = 0x0000fff0;
- vcpu->arch.regs[VCPU_REGS_RIP] = save->rip;
-
- /*
- * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
- * It also updates the guest-visible cr0 value.
- */
- svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
- kvm_mmu_reset_context(vcpu);
-
- save->cr4 = X86_CR4_PAE;
- /* rdx = ?? */
-
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
@@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = vcpu->arch.pat;
save->cr3 = 0;
- save->cr4 = 0;
}
svm->current_vmcb->asid_generation = 0;
svm->asid = 0;
svm->nested.vmcb12_gpa = INVALID_GPA;
svm->nested.last_vmcb12_gpa = INVALID_GPA;
- vcpu->arch.hflags = 0;
if (!kvm_pause_in_guest(vcpu->kvm)) {
control->pause_filter_count = pause_filter_count;
@@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 dummy;
- u32 eax = 1;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
- if (!init_event) {
- vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
- }
init_vmcb(vcpu);
-
- kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false);
- kvm_rdx_write(vcpu, eax);
-
- if (kvm_vcpu_apicv_active(vcpu) && !init_event)
- avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
}
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
@@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
sd->current_vmcb = svm->vmcb;
indirect_branch_prediction_barrier();
}
- avic_vcpu_load(vcpu, cpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
- avic_vcpu_put(vcpu);
+ if (kvm_vcpu_apicv_active(vcpu))
+ avic_vcpu_put(vcpu);
+
svm_prepare_host_switch(vcpu);
++vcpu->stat.host_state_reload;
@@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
}
}
@@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu)
return -EINVAL;
/*
- * VMCB is undefined after a SHUTDOWN intercept
- * so reinitialize it.
+ * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put
+ * the VMCB in a known good state. Unfortuately, KVM doesn't have
+ * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
+ * userspace. At a platform view, INIT is acceptable behavior as
+ * there exist bare metal platforms that automatically INIT the CPU
+ * in response to shutdown.
*/
clear_page(svm->vmcb);
- init_vmcb(vcpu);
+ kvm_vcpu_reset(vcpu, true);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
svm->msr_decfg = data;
break;
}
- case MSR_IA32_APICBASE:
- if (kvm_vcpu_apicv_active(vcpu))
- avic_update_vapic_bar(to_svm(vcpu), data);
- fallthrough;
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
*/
- svm_toggle_avic_for_irq_window(vcpu, true);
+ kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN);
++vcpu->stat.irq_window_exits;
return 1;
@@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
"excp_to:", save->last_excp_to);
}
-static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
- svm_exit_handlers[exit_code])
- return 0;
+ return (exit_code < ARRAY_SIZE(svm_exit_handlers) &&
+ svm_exit_handlers[exit_code]);
+}
+static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
+{
vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code);
dump_vmcb(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code)
vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_code;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
-
- return -EINVAL;
+ return 0;
}
int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code)
{
- if (svm_handle_invalid_exit(vcpu, exit_code))
- return 0;
+ if (!svm_check_exit_valid(vcpu, exit_code))
+ return svm_handle_invalid_exit(vcpu, exit_code);
#ifdef CONFIG_RETPOLINE
if (exit_code == SVM_EXIT_MSR)
@@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* via AVIC. In such case, we need to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ.
*/
- svm_toggle_avic_for_irq_window(vcpu, false);
+ kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN);
svm_set_vintr(svm);
}
}
@@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu)
pre_svm_run(vcpu);
+ WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+
sync_lapic_to_cr8(vcpu);
if (unlikely(svm->asid != svm->vmcb->control.asid)) {
@@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
- .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index bd0fe94c2920..524d943f3efc 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
-{
- svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
- vmcb_mark_dirty(svm->vmcb, VMCB_AVIC);
-}
-
static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm);
-void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
@@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu);
void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
bool svm_check_apicv_inhibit_reasons(ulong bit);
-void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate);
void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr);
void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr);
diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h
index 8170f2a5a16f..22e2b019de37 100644
--- a/arch/x86/kvm/svm/svm_ops.h
+++ b/arch/x86/kvm/svm/svm_ops.h
@@ -4,7 +4,7 @@
#include <linux/compiler_types.h>
-#include <asm/kvm_host.h>
+#include "x86.h"
#define svm_asm(insn, clobber...) \
do { \
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 896b2a50b4aa..0dab1b7b529f 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs);
#if IS_ENABLED(CONFIG_HYPERV)
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 2ec9b46f0d0c..152ab0aa82cf 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -73,8 +73,6 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
static __always_inline int get_evmcs_offset(unsigned long field,
u16 *clean_field)
{
@@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field,
return evmcs_field->offset;
}
-#undef ROL16
-
static inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b3f77d18eb5a..ccb03d69546c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
}
}
-static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
{
u32 exec_control;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
@@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
/*
* PIN CONTROLS
*/
- exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control = __pin_controls_get(vmcs01);
exec_control |= (vmcs12->pin_based_vm_exec_control &
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
* EXEC CONTROLS
*/
- exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
@@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* SECONDARY EXEC CONTROLS
*/
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmx->secondary_exec_control;
+ exec_control = __secondary_exec_controls_get(vmcs01);
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
@@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_TSC_SCALING);
+ SECONDARY_EXEC_TSC_SCALING |
+ SECONDARY_EXEC_DESC);
+
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
@@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
*/
- exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
- ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= vmcs12->vm_entry_controls;
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
@@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
- exec_control = vmx_vmexit_ctrl();
+ exec_control = __vm_exit_controls_get(vmcs01);
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
vm_exit_controls_set(vmx, exec_control);
/*
@@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- prepare_vmcs02_early(vmx, vmcs12);
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) {
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
seg.l = 1;
else
seg.db = 1;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
seg = (struct kvm_segment) {
.base = 0,
.limit = 0xFFFFFFFF,
@@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.g = 1
};
seg.selector = vmcs12->host_ds_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
seg.selector = vmcs12->host_es_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
seg.selector = vmcs12->host_ss_selector;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
seg.selector = vmcs12->host_fs_selector;
seg.base = vmcs12->host_fs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
seg.selector = vmcs12->host_gs_selector;
seg.base = vmcs12->host_gs_base;
- vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
seg = (struct kvm_segment) {
.base = vmcs12->host_tr_base,
.limit = 0x67,
@@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
.type = 11,
.present = 1
};
- vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
+ memset(&seg, 0, sizeof(seg));
+ seg.unusable = 1;
+ __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR);
kvm_set_dr(vcpu, 7, 0x400);
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
vmcs12->vm_exit_msr_load_count))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
@@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
-
/*
* This nasty bit of open coding is a compromise between blindly
* loading L1's MSRs using the exit load lists (incorrect emulation
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 9efc1a6b8693..10cc4f65c4ef 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!(msr & MSR_PMC_FULL_WIDTH_BIT))
data = (s64)(s32)data;
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
pmc->counter += data - pmc_read_counter(pmc);
- if (pmc->perf_event)
+ if (pmc->perf_event && !pmc->is_paused)
perf_event_period(pmc->perf_event,
get_sample_period(pmc, data));
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 4b9957e2bf5b..6e5de2e2b0da 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -11,6 +11,8 @@
#include "capabilities.h"
+#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
+
struct vmcs_hdr {
u32 revision_id:31;
u32 shadow_vmcs:1;
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index d9f5d7c56ae3..cab6ba7a5005 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -2,7 +2,6 @@
#include "vmcs12.h"
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD64(number, name) \
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 5e0e1b39f495..2a45f026ee11 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void)
extern const unsigned short vmcs_field_to_offset_table[];
extern const unsigned int nr_vmcs12_fields;
-#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n)))))
-
static inline short vmcs_field_to_offset(unsigned long field)
{
unsigned short offset;
@@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field)
return offset;
}
-#undef ROL16
-
static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field,
u16 offset)
{
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 927a552393b9..0c2c0d5ae873 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
- (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
- X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr,
}
/*
- * Set up the vmcs to automatically save and restore system
- * msrs. Don't touch the 64-bit msrs if the guest is in legacy
- * mode, as fiddling with msrs is very expensive.
+ * Configuring user return MSRs to automatically save, load, and restore MSRs
+ * that need to be shoved into hardware when running the guest. Note, omitting
+ * an MSR here does _NOT_ mean it's not emulated, only that it will not be
+ * loaded into hardware when running the guest.
*/
-static void setup_msrs(struct vcpu_vmx *vmx)
+static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
bool load_syscall_msrs;
@@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx)
*/
vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(&vmx->vcpu);
-
/*
* The set of MSRs to load may have changed, reload MSRs before the
* next VM-Enter.
@@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
break;
case VCPU_EXREG_CR3:
- if (is_unrestricted_guest(vcpu) ||
- (enable_ept && is_paging(vcpu)))
+ /*
+ * When intercepting CR3 loads, e.g. for shadowing paging, KVM's
+ * CR3 is loaded into hardware, not the guest's CR3.
+ */
+ if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
break;
case VCPU_EXREG_CR4:
@@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
break;
default:
- WARN_ON_ONCE(1);
+ KVM_BUG_ON(1, vcpu->kvm);
break;
}
}
@@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
- vmx_set_segment(vcpu, save, seg);
+ __vmx_set_segment(vcpu, save, seg);
}
static void enter_pmode(struct kvm_vcpu *vcpu)
@@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
+ __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
@@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
-
- kvm_mmu_reset_context(vcpu);
}
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
msr->data = efer & ~EFER_LME;
}
- setup_msrs(vmx);
+ vmx_setup_uret_msrs(vmx);
return 0;
}
@@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
- unsigned long cr0,
- struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
- vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- if (!(cr0 & X86_CR0_PG)) {
- /* From paging/starting to nonpaging */
- exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- } else if (!is_paging(vcpu)) {
- /* From nonpaging to paging */
- exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING);
- vcpu->arch.cr0 = cr0;
- vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
- }
-
- if (!(cr0 & X86_CR0_WP))
- *hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING)
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long hw_cr0;
+ unsigned long hw_cr0, old_cr0_pg;
+ u32 tmp;
+
+ old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+ if (!enable_ept)
+ hw_cr0 |= X86_CR0_WP;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
@@ -3041,22 +3021,60 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
enter_rmode(vcpu);
}
+ vmcs_writel(CR0_READ_SHADOW, cr0);
+ vmcs_writel(GUEST_CR0, hw_cr0);
+ vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
+ if (!old_cr0_pg && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
- if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
+ else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
- if (enable_ept && !is_unrestricted_guest(vcpu))
- ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+ if (enable_ept && !is_unrestricted_guest(vcpu)) {
+ /*
+ * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+ * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+ * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+ * KVM's CR3 is installed.
+ */
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
- vmcs_writel(CR0_READ_SHADOW, cr0);
- vmcs_writel(GUEST_CR0, hw_cr0);
- vcpu->arch.cr0 = cr0;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
+ /*
+ * When running with EPT but not unrestricted guest, KVM must
+ * intercept CR3 accesses when paging is _disabled_. This is
+ * necessary because restricted guests can't actually run with
+ * paging disabled, and so KVM stuffs its own CR3 in order to
+ * run the guest when identity mapped page tables.
+ *
+ * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+ * update, it may be stale with respect to CR3 interception,
+ * e.g. after nested VM-Enter.
+ *
+ * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+ * stores to forward them to L1, even if KVM does not need to
+ * intercept them to preserve its identity mapped page tables.
+ */
+ if (!(cr0 & X86_CR0_PG)) {
+ exec_controls_setbit(vmx, CR3_EXITING_BITS);
+ } else if (!is_guest_mode(vcpu)) {
+ exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+ } else {
+ tmp = exec_controls_get(vmx);
+ tmp &= ~CR3_EXITING_BITS;
+ tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+ exec_controls_set(vmx, tmp);
+ }
+
+ /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
+ if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
+ vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+ }
/* depends on vcpu->arch.cr0 to be set to a new value */
vmx->emulation_required = emulation_required(vcpu);
@@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var)
return ar;
}
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
vmcs_write16(sf->selector, var->selector);
else if (var->s)
fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
- goto out;
+ return;
}
vmcs_writel(sf->base, var->base);
@@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
+}
-out:
- vmx->emulation_required = emulation_required(vcpu);
+static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
+{
+ __vmx_set_segment(vcpu, var, seg);
+
+ to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
-{
- u8 mode = 0;
-
- if (cpu_has_secondary_exec_ctrls() &&
- (secondary_exec_controls_get(to_vmx(vcpu)) &
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
- mode |= MSR_BITMAP_MODE_X2APIC;
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
- mode |= MSR_BITMAP_MODE_X2APIC_APICV;
- }
-
- return mode;
-}
-
static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
{
unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
@@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
}
}
-static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u8 mode;
+
if (!cpu_has_vmx_msr_bitmap())
return;
+ if (cpu_has_secondary_exec_ctrls() &&
+ (secondary_exec_controls_get(vmx) &
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
+ mode = MSR_BITMAP_MODE_X2APIC;
+ if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+ mode |= MSR_BITMAP_MODE_X2APIC_APICV;
+ } else {
+ mode = 0;
+ }
+
+ if (mode == vmx->x2apic_msr_bitmap_mode)
+ return;
+
+ vmx->x2apic_msr_bitmap_mode = mode;
+
vmx_reset_x2apic_msrs(vcpu, mode);
/*
@@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
}
}
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- u8 mode = vmx_msr_bitmap_mode(vcpu);
- u8 changed = mode ^ vmx->msr_bitmap_mode;
-
- if (!changed)
- return;
-
- if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(vcpu, mode);
-
- vmx->msr_bitmap_mode = mode;
-}
-
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
}
pt_update_intercept_for_msr(vcpu);
- vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
{
u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
@@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
return pin_based_exec_ctrl;
}
+static u32 vmx_vmentry_ctrl(void)
+{
+ u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmentry_ctrl &
+ ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
+}
+
+static u32 vmx_vmexit_ctrl(void)
+{
+ u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+
+ if (vmx_pt_mode_is_system())
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
+ return vmexit_ctrl &
+ ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
+}
+
static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
}
- if (cpu_has_vmx_msr_bitmap())
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
-u32 vmx_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
-static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
{
struct kvm_vcpu *vcpu = &vmx->vcpu;
@@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (!vcpu->kvm->arch.bus_lock_detection_enabled)
exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
- vmx->secondary_exec_control = exec_control;
+ return exec_control;
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx)
exec_controls_set(vmx, vmx_exec_control(vmx));
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- secondary_exec_controls_set(vmx, vmx->secondary_exec_control);
- }
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
@@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+ if (cpu_has_vmx_tpr_shadow()) {
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
+ if (cpu_need_tpr_shadow(&vmx->vcpu))
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
+ __pa(vmx->vcpu.arch.apic->regs));
+ vmcs_write32(TPR_THRESHOLD, 0);
+ }
+
+ vmx_setup_uret_msrs(vmx);
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct msr_data apic_base_msr;
- u64 cr0;
vmx->rmode.vm86_active = 0;
vmx->spec_ctrl = 0;
vmx->msr_ia32_umwait_control = 0;
- vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
- if (!init_event) {
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(vcpu, &apic_base_msr);
- }
-
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
@@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- if (!init_event) {
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
- }
-
- kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
- kvm_rip_write(vcpu, 0xfff0);
-
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (kvm_mpx_supported())
vmcs_write64(GUEST_BNDCFGS, 0);
- setup_msrs(vmx);
-
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow() && !init_event) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (cpu_need_tpr_shadow(vcpu))
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vcpu->arch.apic->regs));
- vmcs_write32(TPR_THRESHOLD, 0);
- }
-
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx->vcpu.arch.cr0 = cr0;
- vmx_set_cr0(vcpu, cr0); /* enter rmode */
- vmx_set_cr4(vcpu, 0);
- vmx_set_efer(vcpu, 0);
-
- vmx_update_exception_bitmap(vcpu);
-
vpid_sync_context(vmx->vpid);
- if (init_event)
- vmx_clear_hlt(vcpu);
}
static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, err);
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
err = kvm_set_cr3(vcpu, val);
return kvm_complete_insn_gp(vcpu, err);
case 4:
@@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- WARN_ONCE(1, "Guest should always own CR0.TS");
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
- trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- return kvm_skip_emulated_instruction(vcpu);
+ KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS");
+ return -EIO;
case 1: /*mov from cr*/
switch (cr) {
case 3:
WARN_ON_ONCE(enable_unrestricted_guest);
+
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
@@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
+
+ /*
+ * exc_debug expects dr6 to be cleared after it runs, avoid that it sees
+ * a stale dr6 from the guest.
+ */
+ set_debugreg(DR6_RESERVED, 6);
}
static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- WARN_ON_ONCE(!enable_vnmi);
+ if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
+ return -EIO;
+
exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
* below) should never happen as that means we incorrectly allowed a
* nested VM-Enter with an invalid vmcs12.
*/
- WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
+ return -EIO;
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
@@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
}
secondary_exec_controls_set(vmx, sec_exec_control);
- vmx_update_msr_bitmap(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu);
}
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
@@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
int max_irr;
bool max_irr_updated;
- WARN_ON(!vcpu->arch.apicv_active);
+ if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ return -EIO;
+
if (pi_test_on(&vmx->pi_desc)) {
pi_clear_on(&vmx->pi_desc);
/*
@@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
gate_desc *desc = (gate_desc *)host_idt_base + vector;
- if (WARN_ONCE(!is_external_intr(intr_info),
+ if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
@@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (vmx->emulation_required)
+ return;
+
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
@@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
+ /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */
+ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
+ set_debugreg(vcpu->arch.dr6, 6);
+
/* When single-stepping over STI and MOV SS, we must clear the
* corresponding interruptibility bits in the guest state. Otherwise
* vmentry fails as it then expects bit 14 (BS) in pending debug
@@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
- vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
@@ -6997,7 +7017,7 @@ exit:
return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
-static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
+static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
{
/*
* These bits in the secondary execution controls field
@@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_DESC;
- u32 new_ctl = vmx->secondary_exec_control;
u32 cur_ctl = secondary_exec_controls_get(vmx);
secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask));
@@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
/* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
vcpu->arch.xsaves_enabled = false;
- if (cpu_has_secondary_exec_ctrls()) {
- vmx_compute_secondary_exec_control(vmx);
- vmcs_set_secondary_exec_control(vmx);
- }
+ vmx_setup_uret_msrs(vmx);
+
+ if (cpu_has_secondary_exec_ctrls())
+ vmcs_set_secondary_exec_control(vmx,
+ vmx_secondary_exec_control(vmx));
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
@@ -7803,7 +7823,8 @@ static __init int hardware_setup(void)
ept_lpage_level = PG_LEVEL_2M;
else
ept_lpage_level = PG_LEVEL_4K;
- kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
+ kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
+ ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 17a1cb4b059d..4858c5fd95f2 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -227,7 +227,7 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
u8 fail;
- u8 msr_bitmap_mode;
+ u8 x2apic_msr_bitmap_mode;
/*
* If true, host state has been stored in vmx->loaded_vmcs for
@@ -263,8 +263,6 @@ struct vcpu_vmx {
u64 spec_ctrl;
u32 msr_ia32_umwait_control;
- u32 secondary_exec_control;
-
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
@@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
+void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
@@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \
vmx->loaded_vmcs->controls_shadow.lname = val; \
} \
} \
+static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
{ \
- return vmx->loaded_vmcs->controls_shadow.lname; \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
} \
static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
{ \
@@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
vcpu->arch.regs_dirty = 0;
}
-static inline u32 vmx_vmentry_ctrl(void)
-{
- u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
- if (vmx_pt_mode_is_system())
- vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmentry_ctrl &
- ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
-}
-
-static inline u32 vmx_vmexit_ctrl(void)
-{
- u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
- if (vmx_pt_mode_is_system())
- vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmexit_ctrl &
- ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
-}
-
-u32 vmx_exec_control(struct vcpu_vmx *vmx);
-u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx);
-
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 164b64f65a8f..9e9ef47e988c 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -4,13 +4,11 @@
#include <linux/nospec.h>
-#include <asm/kvm_host.h>
#include <asm/vmx.h>
#include "evmcs.h"
#include "vmcs.h"
-
-#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#include "x86.h"
asmlinkage void vmread_error(unsigned long field, bool fault);
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5d5c5ed7dd4..28ef14155726 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
STATS_DESC_COUNTER(VM, mmu_recycled),
STATS_DESC_COUNTER(VM, mmu_cache_miss),
STATS_DESC_ICOUNTER(VM, mmu_unsync),
- STATS_DESC_ICOUNTER(VM, lpages),
+ STATS_DESC_ICOUNTER(VM, pages_4k),
+ STATS_DESC_ICOUNTER(VM, pages_2m),
+ STATS_DESC_ICOUNTER(VM, pages_1g),
STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
};
-static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
- sizeof(struct kvm_vm_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vm_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
STATS_DESC_ICOUNTER(VCPU, guest_mode)
};
-static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
- sizeof(struct kvm_vcpu_stat) / sizeof(u64));
const struct kvm_stats_header kvm_vcpu_stats_header = {
.name_size = KVM_STATS_NAME_SIZE,
@@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible noinstr void kvm_spurious_fault(void)
+/*
+ * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
+ *
+ * Hardware virtualization extension instructions may fault if a reboot turns
+ * off virtualization while processes are running. Usually after catching the
+ * fault we just panic; during reboot instead the instruction is ignored.
+ */
+noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
@@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
for (i = 0; i < KVM_NR_DB_REGS; i++)
vcpu->arch.eff_db[i] = vcpu->arch.db[i];
- vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
}
}
@@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
- /*
- * If userspace has set any breakpoints or watchpoints, dr6 is restored
- * on every vmexit, but if not, we might have a stale dr6 from the
- * guest. do_debug expects dr6 to be cleared after it runs, do the same.
- */
- set_debugreg(0, 6);
}
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
* there is no pkey in EPT page table for L1 guest or EPT
* shadow page table for L2 guest.
*/
- if (vcpu_match_mmio_gva(vcpu, gva)
- && !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.mmio_access, 0, access)) {
+ if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
+ !permission_fault(vcpu, vcpu->arch.walk_mmu,
+ vcpu->arch.mmio_access, 0, access))) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated);
static void kvm_apicv_init(struct kvm *kvm)
{
+ mutex_init(&kvm->arch.apicv_update_lock);
+
if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
@@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
can_inject = false;
}
+ /* Don't inject interrupts if the user asked to avoid doing so */
+ if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
+ return 0;
+
/*
* Finally, inject interrupt events. If an event cannot be injected
* due to architectural conditions (e.g. IF=0) a window-open exit
@@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
+ bool activate;
+
if (!lapic_in_kernel(vcpu))
return;
- vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ mutex_lock(&vcpu->kvm->arch.apicv_update_lock);
+
+ activate = kvm_apicv_activated(vcpu->kvm);
+ if (vcpu->arch.apicv_active == activate)
+ goto out;
+
+ vcpu->arch.apicv_active = activate;
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
@@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
*/
if (!vcpu->arch.apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+out:
+ mutex_unlock(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
-/*
- * NOTE: Do not hold any lock prior to calling this.
- *
- * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
- * locked, because it calls __x86_set_memory_region() which does
- * synchronize_srcu(&kvm->srcu).
- */
-void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
- struct kvm_vcpu *except;
- unsigned long old, new, expected;
+ unsigned long old, new;
if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
!static_call(kvm_x86_check_apicv_inhibit_reasons)(bit))
return;
- old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
- do {
- expected = new = old;
- if (activate)
- __clear_bit(bit, &new);
- else
- __set_bit(bit, &new);
- if (new == old)
- break;
- old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
- } while (old != expected);
-
- if (!!old == !!new)
- return;
+ old = new = kvm->arch.apicv_inhibit_reasons;
- trace_kvm_apicv_update_request(activate, bit);
- if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
- static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate);
+ if (activate)
+ __clear_bit(bit, &new);
+ else
+ __set_bit(bit, &new);
+
+ if (!!old != !!new) {
+ trace_kvm_apicv_update_request(activate, bit);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+ kvm->arch.apicv_inhibit_reasons = new;
+ if (new) {
+ unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
+ kvm_zap_gfn_range(kvm, gfn, gfn+1);
+ }
+ } else
+ kvm->arch.apicv_inhibit_reasons = new;
+}
+EXPORT_SYMBOL_GPL(__kvm_request_apicv_update);
- /*
- * Sending request to update APICV for all other vcpus,
- * while update the calling vcpu immediately instead of
- * waiting for another #VMEXIT to handle the request.
- */
- except = kvm_get_running_vcpu();
- kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
- except);
- if (except)
- kvm_vcpu_update_apicv(except);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ mutex_lock(&kvm->arch.apicv_update_lock);
+ __kvm_request_apicv_update(kvm, activate, bit);
+ mutex_unlock(&kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_request_pending(vcpu)) {
+ if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) {
+ r = -EIO;
+ goto out;
+ }
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
@@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
set_debugreg(vcpu->arch.eff_db[1], 1);
set_debugreg(vcpu->arch.eff_db[2], 2);
set_debugreg(vcpu->arch.eff_db[3], 3);
- set_debugreg(vcpu->arch.dr6, 6);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
} else if (unlikely(hw_breakpoint_active())) {
set_debugreg(0, 7);
}
@@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
kvm_update_dr0123(vcpu);
kvm_update_dr7(vcpu);
- vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
/*
@@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
- if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
+ (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
r = -EINVAL;
goto out;
}
@@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu)
static int sync_regs(struct kvm_vcpu *vcpu)
{
- if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
- return -EINVAL;
-
if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
__set_regs(vcpu, &vcpu->run->s.regs.regs);
vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
@@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long new_cr0;
+ u32 eax, dummy;
kvm_lapic_reset(vcpu, init_event);
@@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ /*
+ * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
+ * if no CPUID match is found. Note, it's impossible to get a match at
+ * RESET since KVM emulates RESET before exposing the vCPU to userspace,
+ * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET.
+ * But, go through the motions in case that's ever remedied.
+ */
+ eax = 1;
+ if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true))
+ eax = 0x600;
+ kvm_rdx_write(vcpu, eax);
+
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0xfff0);
+
+ /*
+ * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions
+ * of Intel's SDM list CD/NW as being set on INIT, but they contradict
+ * (or qualify) that with a footnote stating that CD/NW are preserved.
+ */
+ new_cr0 = X86_CR0_ET;
+ if (init_event)
+ new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
+ else
+ new_cr0 |= X86_CR0_NW | X86_CR0_CD;
+
+ static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
+ static_call(kvm_x86_set_cr4)(vcpu, 0);
+ static_call(kvm_x86_set_efer)(vcpu, 0);
+ static_call(kvm_x86_update_exception_bitmap)(vcpu);
+
/*
* Reset the MMU context if paging was enabled prior to INIT (which is
* implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
@@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
*/
if (old_cr0 & X86_CR0_PG)
kvm_mmu_reset_context(vcpu);
+
+ /*
+ * Intel's SDM states that all TLB entries are flushed on INIT. AMD's
+ * APM states the TLBs are untouched by INIT, but it also states that
+ * the TLBs are flushed on "External initialization of the processor."
+ * Flush the guest TLB regardless of vendor, there is no meaningful
+ * benefit in relying on the guest to flush the TLB immediately after
+ * INIT. A spurious TLB flush is benign and likely negligible from a
+ * performance perspective.
+ */
+ if (init_event)
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
@@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
+ kvm_xen_init_vm(kvm);
return static_call(kvm_x86_vm_init)(kvm);
}
@@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
int level = i + 1;
- int lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
+ int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
WARN_ON(slot->arch.rmap[i]);
@@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm,
int lpages;
int level = i + 1;
- lpages = gfn_to_index(slot->base_gfn + npages - 1,
- slot->base_gfn, level) + 1;
+ lpages = __kvm_mmu_slot_lpages(slot, npages, level);
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
@@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *old,
- struct kvm_memory_slot *new,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_mmu_change_mmu_pages(kvm,
kvm_mmu_calculate_default_mmu_pages(kvm));
- /*
- * FIXME: const-ify all uses of struct kvm_memory_slot.
- */
- kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+ kvm_mmu_slot_apply_flags(kvm, old, new, change);
/* Free the arrays associated with the old memslot. */
if (change == KVM_MR_MOVE)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 44ae10312740..7d66d63dc55a 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,6 +8,8 @@
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
+void kvm_spurious_fault(void);
+
static __always_inline void kvm_guest_enter_irqoff(void)
{
/*
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index ae17250e1efe..9ea9c3dabe37 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
{
gpa_t gpa = gfn_to_gpa(gfn);
int wc_ofs, sec_hi_ofs;
- int ret;
+ int ret = 0;
int idx = srcu_read_lock(&kvm->srcu);
- ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache,
- gpa, PAGE_SIZE);
- if (ret)
+ if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) {
+ ret = -EFAULT;
goto out;
-
- kvm->arch.xen.shinfo_set = true;
+ }
+ kvm->arch.xen.shinfo_gfn = gfn;
/* Paranoia checks on the 32-bit struct layout */
BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
@@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
if (data->u.shared_info.gfn == GPA_INVALID) {
- kvm->arch.xen.shinfo_set = false;
+ kvm->arch.xen.shinfo_gfn = GPA_INVALID;
r = 0;
break;
}
@@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
- if (kvm->arch.xen.shinfo_set)
- data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
- else
- data->u.shared_info.gfn = GPA_INVALID;
+ data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn);
r = 0;
break;
@@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
return 0;
}
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ kvm->arch.xen.shinfo_gfn = GPA_INVALID;
+}
+
void kvm_xen_destroy_vm(struct kvm *kvm)
{
if (kvm->arch.xen_hvm_config.msr)
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 463a7844a8ca..cc0cf5f37450 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
+void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
@@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
return 1;
}
+static inline void kvm_xen_init_vm(struct kvm *kvm)
+{
+}
+
static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{
}
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 75ef19aa8903..23a14d82e783 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -127,14 +127,12 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long ret = 0;
if (min_pfn_mapped < max_pfn_mapped) {
- ret = memblock_find_in_range(
+ ret = memblock_phys_alloc_range(
+ PAGE_SIZE * num, PAGE_SIZE,
min_pfn_mapped << PAGE_SHIFT,
- max_pfn_mapped << PAGE_SHIFT,
- PAGE_SIZE * num , PAGE_SIZE);
+ max_pfn_mapped << PAGE_SHIFT);
}
- if (ret)
- memblock_reserve(ret, PAGE_SIZE * num);
- else if (can_use_brk_pgt)
+ if (!ret && can_use_brk_pgt)
ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
if (!ret)
@@ -610,8 +608,17 @@ static void __init memory_map_top_down(unsigned long map_start,
unsigned long addr;
unsigned long mapped_ram_size = 0;
- /* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ /*
+ * Systems that have many reserved areas near top of the memory,
+ * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+ * require lots of 4K mappings which may exhaust pgt_buf.
+ * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+ * there is enough mapped memory that can be allocated from
+ * memblock.
+ */
+ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+ map_end);
+ memblock_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 74b78840182d..bd90b8fe81e4 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -801,8 +801,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
return __add_pages(nid, start_pfn, nr_pages, params);
}
-void arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ddeaba947eb3..a6e11763763f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1255,8 +1255,7 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end)
remove_pagetable(start, end, true, NULL);
}
-void __ref arch_remove_memory(int nid, u64 start, u64 size,
- struct vmem_altmap *altmap)
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index e94da744386f..a1b5c71099e6 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -376,15 +376,14 @@ static int __init numa_alloc_distance(void)
cnt++;
size = cnt * cnt * sizeof(numa_distance[0]);
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
}
- memblock_reserve(phys, size);
numa_distance = __va(phys);
numa_distance_cnt = cnt;
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index 87d77cc52f86..737491b13728 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -447,13 +447,12 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
if (numa_dist_cnt) {
u64 phys;
- phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
- phys_size, PAGE_SIZE);
+ phys = memblock_phys_alloc_range(phys_size, PAGE_SIZE, 0,
+ PFN_PHYS(max_pfn_mapped));
if (!phys) {
pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
- memblock_reserve(phys, phys_size);
phys_dist = __va(phys);
for (i = 0; i < numa_dist_cnt; i++)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 16d76f814e9b..0fe6aacef3db 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1961,6 +1961,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (flags & BPF_TRAMP_F_CALL_ORIG)
stack_size += 8; /* room for return value of orig_call */
+ if (flags & BPF_TRAMP_F_IP_ARG)
+ stack_size += 8; /* room for IP address argument */
+
if (flags & BPF_TRAMP_F_SKIP_FRAME)
/* skip patched call instruction and point orig_call to actual
* body of the kernel function.
@@ -1974,6 +1977,22 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
+ if (flags & BPF_TRAMP_F_IP_ARG) {
+ /* Store IP address of the traced function:
+ * mov rax, QWORD PTR [rbp + 8]
+ * sub rax, X86_PATCH_SIZE
+ * mov QWORD PTR [rbp - stack_size], rax
+ */
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
+ EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -stack_size);
+
+ /* Continue with stack_size for regs storage, stack will
+ * be correctly restored with 'leave' instruction.
+ */
+ stack_size -= 8;
+ }
+
save_regs(m, &prog, nr_args, stack_size);
if (flags & BPF_TRAMP_F_CALL_ORIG) {
diff --git a/arch/x86/pci/numachip.c b/arch/x86/pci/numachip.c
index 01a085d9135a..4f0147d4e225 100644
--- a/arch/x86/pci/numachip.c
+++ b/arch/x86/pci/numachip.c
@@ -12,6 +12,7 @@
#include <linux/pci.h>
#include <asm/pci_x86.h>
+#include <asm/numachip/numachip.h>
static u8 limit __read_mostly;
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7d2525691854..101081ad64b6 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -146,8 +146,7 @@ static void sta2x11_map_ep(struct pci_dev *pdev)
dev_err(dev, "sta2x11: could not set DMA offset\n");
dev->bus_dma_limit = max_amba_addr;
- pci_set_consistent_dma_mask(pdev, max_amba_addr);
- pci_set_dma_mask(pdev, max_amba_addr);
+ dma_set_mask_and_coherent(&pdev->dev, max_amba_addr);
/* Configure AHB mapping */
pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0);
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 6534c92d0f83..31b5856010cb 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -28,7 +28,7 @@ void __init reserve_real_mode(void)
WARN_ON(slab_is_available());
/* Has to be under 1M so we can execute real-mode AP code. */
- mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
+ mem = memblock_phys_alloc_range(size, PAGE_SIZE, 0, 1<<20);
if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
else
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index b95db9daf0e8..4c6c2be0c899 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -101,4 +101,16 @@ static inline void remap_stack_and_trap(void)
"memory");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movl %%esp,%0 ;"
+ "andl %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 6e2626b77a2e..e9c4b2b38803 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -108,4 +108,16 @@ static inline void remap_stack_and_trap(void)
__syscall_clobber, "r10", "r8", "r9");
}
+static __always_inline void *get_stub_page(void)
+{
+ unsigned long ret;
+
+ asm volatile (
+ "movq %%rsp,%0 ;"
+ "andq %1,%0"
+ : "=a" (ret)
+ : "g" (~(UM_KERN_PAGE_SIZE - 1)));
+
+ return (void *)ret;
+}
#endif
diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c
index 21836eaf1725..f7eefba034f9 100644
--- a/arch/x86/um/stub_segv.c
+++ b/arch/x86/um/stub_segv.c
@@ -11,9 +11,8 @@
void __attribute__ ((__section__ (".__syscall_stub")))
stub_segv_handler(int sig, siginfo_t *info, void *p)
{
- int stack;
+ struct faultinfo *f = get_stub_page();
ucontext_t *uc = p;
- struct faultinfo *f = (void *)(((unsigned long)&stack) & ~(UM_KERN_PAGE_SIZE - 1));
GET_FAULTINFO_FROM_MC(*f, &uc->uc_mcontext);
trap_myself();
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 03149422dce2..753f63734c13 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -116,9 +116,8 @@ static void __init xen_banner(void)
HYPERVISOR_xen_version(XENVER_extraversion, &extra);
pr_info("Booting paravirtualized kernel on %s\n", pv_info.name);
- printk(KERN_INFO "Xen version: %d.%d%s%s\n",
- version >> 16, version & 0xffff, extra.extraversion,
- xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+ pr_info("Xen version: %d.%d%s (preserve-AD)\n",
+ version >> 16, version & 0xffff, extra.extraversion);
}
static void __init xen_pv_init_platform(void)
@@ -1302,13 +1301,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_init_apic();
#endif
- if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
- pv_ops.mmu.ptep_modify_prot_start =
- xen_ptep_modify_prot_start;
- pv_ops.mmu.ptep_modify_prot_commit =
- xen_ptep_modify_prot_commit;
- }
-
machine_ops = xen_machine_ops;
/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ade789e73ee4..1df5f01529e5 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2099,8 +2099,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.set_pte = xen_set_pte_init,
.set_pmd = xen_set_pmd_hyper,
- .ptep_modify_prot_start = __ptep_modify_prot_start,
- .ptep_modify_prot_commit = __ptep_modify_prot_commit,
+ .ptep_modify_prot_start = xen_ptep_modify_prot_start,
+ .ptep_modify_prot_commit = xen_ptep_modify_prot_commit,
.pte_val = PV_CALLEE_SAVE(xen_pte_val),
.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index ac06ca32e9ef..5e6e236977c7 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -618,8 +618,8 @@ int xen_alloc_p2m_entry(unsigned long pfn)
}
/* Expanded the p2m? */
- if (pfn > xen_p2m_last_pfn) {
- xen_p2m_last_pfn = pfn;
+ if (pfn >= xen_p2m_last_pfn) {
+ xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
}
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 96d7f7d39cb9..62ac4898df1a 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -7,6 +7,8 @@
* Copyright (c) 2010, Citrix
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/init.h>
#include <linux/io.h>
#include <linux/export.h>
@@ -30,13 +32,13 @@ static int check_platform_magic(void)
magic = inw(XEN_IOPORT_MAGIC);
if (magic != XEN_IOPORT_MAGIC_VAL) {
- printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
+ pr_err("Xen Platform PCI: unrecognised magic value\n");
return XEN_PLATFORM_ERR_MAGIC;
}
protocol = inb(XEN_IOPORT_PROTOVER);
- printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
+ pr_debug("Xen Platform PCI: I/O protocol version %d\n",
protocol);
switch (protocol) {
@@ -44,12 +46,12 @@ static int check_platform_magic(void)
outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
- printk(KERN_ERR "Xen Platform: blacklisted by host\n");
+ pr_err("Xen Platform: blacklisted by host\n");
return XEN_PLATFORM_ERR_BLACKLIST;
}
break;
default:
- printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version\n");
+ pr_warn("Xen Platform PCI: unknown I/O protocol version\n");
return XEN_PLATFORM_ERR_PROTOCOL;
}
@@ -155,12 +157,12 @@ void xen_unplug_emulated_devices(void)
* been compiled for this kernel (modules or built-in are both OK). */
if (!xen_emul_unplug) {
if (xen_must_unplug_nics()) {
- printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
+ pr_info("Netfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated NICs.\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
}
if (xen_must_unplug_disks()) {
- printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
+ pr_info("Blkfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated disks.\n"
"You might have to change the root device\n"
"from /dev/hd[a-d] to /dev/xvd[a-d]\n"
@@ -200,7 +202,7 @@ static int __init parse_xen_emul_unplug(char *arg)
else if (!strncmp(p, "never", l))
xen_emul_unplug |= XEN_UNPLUG_NEVER;
else
- printk(KERN_WARNING "unrecognised option '%s' "
+ pr_warn("unrecognised option '%s' "
"in parameter 'xen_emul_unplug'\n", p);
}
return 0;
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index c2ac319f11a4..96afadf9878e 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -64,7 +64,6 @@ static void cpu_bringup(void)
cr4_init();
cpu_init();
touch_softlockup_watchdog();
- preempt_disable();
/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {