aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/vmx
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/vmx')
-rw-r--r--arch/x86/kvm/vmx/capabilities.h14
-rw-r--r--arch/x86/kvm/vmx/evmcs.c4
-rw-r--r--arch/x86/kvm/vmx/evmcs.h48
-rw-r--r--arch/x86/kvm/vmx/nested.c200
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c63
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c278
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h14
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h6
-rw-r--r--arch/x86/kvm/vmx/vmenter.S14
-rw-r--r--arch/x86/kvm/vmx/vmx.c482
-rw-r--r--arch/x86/kvm/vmx/vmx.h47
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h43
14 files changed, 775 insertions, 447 deletions
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 4705ad55abb5..3f430e218375 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include "lapic.h"
+#include "x86.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -53,7 +54,6 @@ struct nested_vmx_msrs {
struct vmcs_config {
int size;
- int order;
u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
@@ -312,6 +312,15 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
+static inline int ept_caps_to_lpage_level(u32 ept_caps)
+{
+ if (ept_caps & VMX_EPT_1GB_PAGE_BIT)
+ return PG_LEVEL_1G;
+ if (ept_caps & VMX_EPT_2MB_PAGE_BIT)
+ return PG_LEVEL_2M;
+ return PG_LEVEL_4K;
+}
+
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
return vmx_capability.ept & VMX_EPT_AD_BIT;
@@ -380,6 +389,9 @@ static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = 0;
+ if (!enable_pmu)
+ return perf_cap;
+
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index ba6f99f584ac..87e3dc10edf4 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -12,8 +12,6 @@
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
-#if IS_ENABLED(CONFIG_HYPERV)
-
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
@@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
+#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
case MSR_IA32_VMX_PROCBASED_CTLS2:
ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
break;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 16731d2cf231..8d70f9aea94b 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
-#if IS_ENABLED(CONFIG_HYPERV)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -73,26 +73,56 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-static __always_inline int get_evmcs_offset(unsigned long field,
- u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
{
unsigned int index = ROL16(field, 6);
const struct evmcs_field *evmcs_field;
- if (unlikely(index >= nr_evmcs_1_fields)) {
- WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
+ if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;
- }
evmcs_field = &vmcs_field_to_evmcs_1[index];
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
}
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field, use it for convenience
+ * here.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+
+ return offset;
+}
+
static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1e2f66951566..ba34e94049c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,6 +7,7 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
+#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -245,7 +246,8 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
src = &prev->host_state;
dest = &vmx->loaded_vmcs->host_state;
- vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
+ vmx_set_vmcs_host_state(dest, src->cr3, src->fs_sel, src->gs_sel,
+ src->fs_base, src->gs_base);
dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
dest->ds_sel = src->ds_sel;
@@ -269,7 +271,13 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;
+
+ /*
+ * All lazily updated registers will be reloaded from VMCS12 on both
+ * vmentry and vmexit.
+ */
+ vcpu->arch.regs_dirty = 0;
}
/*
@@ -391,9 +399,11 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
- kvm_init_shadow_ept_mmu(vcpu,
- to_vmx(vcpu)->nested.msrs.ept_caps &
- VMX_EPT_EXECUTE_ONLY_BIT,
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
+ int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);
+
+ kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
nested_ept_ad_enabled(vcpu),
nested_ept_get_eptp(vcpu));
}
@@ -591,6 +601,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
int msr;
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
+ struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;
/* Nothing to do if the MSR bitmap is not in use. */
@@ -598,6 +609,19 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
return false;
+ /*
+ * MSR bitmap update can be skipped when:
+ * - MSR bitmap for L1 hasn't changed.
+ * - Nested hypervisor (L1) is attempting to launch the same L2 as
+ * before.
+ * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
+ * and tells KVM (L0) there were no changes in MSR bitmap for L2.
+ */
+ if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
+ evmcs->hv_enlightenments_control.msr_bitmap &&
+ evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
+ return true;
+
if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
return false;
@@ -664,6 +688,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);
+ vmx->nested.force_msr_bitmap_recalc = false;
+
return true;
}
@@ -1095,7 +1121,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
* must not be dereferenced.
*/
if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
- CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+ CC(!load_pdptrs(vcpu, cr3))) {
*entry_failure_code = ENTRY_FAIL_PDPTE;
return -EINVAL;
}
@@ -1104,7 +1130,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
kvm_mmu_new_pgd(vcpu, cr3);
vcpu->arch.cr3 = cr3;
- kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
kvm_init_mmu(vcpu);
@@ -1162,29 +1188,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
- * If VPID is enabled and used by vmc12, but L2 does not have a unique
- * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
- * a VPID for L2, flush the current context as the effective ASID is
- * common to both L1 and L2.
- *
- * Defer the flush so that it runs after vmcs02.EPTP has been set by
- * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
- * redundant flushes further down the nested pipeline.
- *
- * If a TLB flush isn't required due to any of the above, and vpid12 is
- * changing then the new "virtual" VPID (vpid12) will reuse the same
- * "real" VPID (vpid02), and so needs to be flushed. There's no direct
- * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
- * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
- * guest-physical mappings, so there is no need to sync the nEPT MMU.
+ * VPID is enabled and in use by vmcs12. If vpid12 is changing, then
+ * emulate a guest TLB flush as KVM does not track vpid12 history nor
+ * is the VPID incorporated into the MMU context. I.e. KVM must assume
+ * that the new vpid12 has never been used and thus represents a new
+ * guest ASID that cannot have entries in the TLB.
*/
- if (!nested_has_guest_tlb_tag(vcpu)) {
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- } else if (is_vmenter &&
- vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- vpid_sync_context(nested_get_vpid02(vcpu));
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ return;
}
+
+ /*
+ * If VPID is enabled, used by vmc12, and vpid12 is not changing but
+ * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
+ * KVM was unable to allocate a VPID for L2, flush the current context
+ * as the effective ASID is common to both L1 and L2.
+ */
+ if (!nested_has_guest_tlb_tag(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -2024,10 +2047,13 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
* Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
- if (from_launch || evmcs_gpa_changed)
+ if (from_launch || evmcs_gpa_changed) {
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
+ vmx->nested.force_msr_bitmap_recalc = true;
+ }
+
return EVMPTRLD_SUCCEEDED;
}
@@ -2594,8 +2620,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl)))
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
+ }
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
@@ -3028,7 +3056,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
bool vm_fail;
if (!nested_early_check)
@@ -3051,12 +3079,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
*/
vmcs_writel(GUEST_RFLAGS, 0);
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
-
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
vmcs_writel(HOST_CR4, cr4);
@@ -3146,7 +3168,7 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
* the guest CR3 might be restored prior to setting the nested
* state which can lead to a load of wrong PDPTRs.
*/
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)))
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
return false;
}
@@ -3344,8 +3366,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -3506,10 +3527,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (evmptrld_status == EVMPTRLD_ERROR) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
- } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
- return nested_vmx_failInvalid(vcpu);
}
+ kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+
+ if (CC(evmptrld_status == EVMPTRLD_VMFAIL))
+ return nested_vmx_failInvalid(vcpu);
+
if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) &&
vmx->nested.current_vmptr == INVALID_GPA))
return nested_vmx_failInvalid(vcpu);
@@ -3605,7 +3629,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
break;
case GUEST_ACTIVITY_WAIT_SIPI:
@@ -4502,9 +4526,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
- /* Service the TLB flush request for L2 before switching to L1. */
- if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
- kvm_vcpu_flush_tlb_current(vcpu);
+ /* Service pending TLB flush requests for L2 before switching to L1. */
+ kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4829,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/*
- * We should allocate a shadow vmcs for vmcs01 only when L1
- * executes VMXON and free it when L1 executes VMXOFF.
- * As it is invalid to execute VMXON twice, we shouldn't reach
- * here when vmcs01 already have an allocated shadow vmcs.
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
*/
- WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
- if (!loaded_vmcs->shadow_vmcs) {
- loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
- if (loaded_vmcs->shadow_vmcs)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- }
return loaded_vmcs->shadow_vmcs;
}
@@ -4857,6 +4882,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
+ vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5076,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- /*
- * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
- * any VMREAD sets the ALU flags for VMfailInvalid.
- */
- if (vmx->nested.current_vmptr == INVALID_GPA ||
- (is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
- return nested_vmx_failInvalid(vcpu);
-
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 value */
- value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
+ * enlightened VMCS is active VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ }
/*
* Now copy part of this value to register or memory, as requested.
@@ -5191,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
+ offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -5260,6 +5308,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
vmx->nested.need_vmcs12_to_shadow_sync = true;
}
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
}
/* Emulate the VMPTRLD instruction */
@@ -5289,8 +5338,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
- if (ghc->gpa != vmptr &&
- kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the
@@ -6396,6 +6444,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
+ vmx->nested.force_msr_bitmap_recalc = true;
ret = nested_vmx_enter_non_root_mode(vcpu, false);
if (ret)
goto error_guest_mode;
@@ -6438,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
max_idx = 0;
for (i = 0; i < nr_vmcs12_fields; i++) {
/* The vmcs12 table is very, very sparsely populated. */
- if (!vmcs_field_to_offset_table[i])
+ if (!vmcs12_field_offsets[i])
continue;
idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@@ -6747,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
}
struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 1b7456b2177b..466d18fc0c5d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ /* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
@@ -68,34 +68,29 @@ static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
reprogram_counter(pmu, bit);
}
-static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
- if (intel_arch_events[i].eventsel == event_select
- && intel_arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(intel_arch_events))
- return PERF_COUNT_HW_MAX;
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+ if (intel_arch_events[i].eventsel != event_select ||
+ intel_arch_events[i].unit_mask != unit_mask)
+ continue;
- return intel_arch_events[i].event_type;
-}
+ /* disable event that reported as not present by cpuid */
+ if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+ return PERF_COUNT_HW_MAX + 1;
-static unsigned intel_find_fixed_event(int idx)
-{
- u32 event;
- size_t size = ARRAY_SIZE(fixed_pmc_events);
+ break;
+ }
- if (idx >= size)
+ if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX;
- event = fixed_pmc_events[array_index_nospec(idx, size)];
- return intel_arch_events[event].event_type;
+ return intel_arch_events[i].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -459,6 +454,21 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
}
+static void setup_fixed_pmc_eventsel(struct kvm_pmu *pmu)
+{
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+ struct kvm_pmc *pmc;
+ u32 event;
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ pmc = &pmu->fixed_counters[i];
+ event = fixed_pmc_events[array_index_nospec(i, size)];
+ pmc->eventsel = (intel_arch_events[event].unit_mask << 8) |
+ intel_arch_events[event].eventsel;
+ }
+}
+
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -477,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
+ if (!entry || !enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
@@ -500,12 +510,14 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->nr_arch_fixed_counters = 0;
} else {
pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- x86_pmu.num_counters_fixed);
+ min3(ARRAY_SIZE(fixed_pmc_events),
+ (size_t) edx.split.num_counters_fixed,
+ (size_t) x86_pmu.num_counters_fixed);
edx.split.bit_width_fixed = min_t(int,
edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
+ setup_fixed_pmc_eventsel(pmu);
}
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
@@ -703,8 +715,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
}
struct kvm_pmu_ops intel_pmu_ops = {
- .find_arch_event = intel_find_arch_event,
- .find_fixed_event = intel_find_fixed_event,
+ .pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
.rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 5f81ef092bd4..aa1fe9085d77 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -5,63 +5,115 @@
#include <asm/cpu.h>
#include "lapic.h"
+#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
/*
- * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
+ * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler()
+ * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when
+ * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled.
+ * The vCPUs posted interrupt descriptor is updated at the same time to set its
+ * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices
+ * wake the target vCPUs. vCPUs are removed from the list and the notification
+ * vector is reset when the vCPU is scheduled in.
*/
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
+/*
+ * Protect the per-CPU list with a per-CPU spinlock to handle task migration.
+ * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
+ * ->sched_in() path will need to take the vCPU off the list of the _previous_
+ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
+ * occur if a wakeup IRQ arrives and attempts to acquire the lock.
+ */
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
return &(to_vmx(vcpu)->pi_desc);
}
+static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
+{
+ /*
+ * PID.ON can be set at any time by a different vCPU or by hardware,
+ * e.g. a device. PID.control must be written atomically, and the
+ * update must be retried with a fresh snapshot an ON change causes
+ * the cmpxchg to fail.
+ */
+ if (cmpxchg64(&pi_desc->control, old, new) != old)
+ return -EBUSY;
+
+ return 0;
+}
+
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
+ unsigned long flags;
unsigned int dest;
/*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
+ * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and
+ * PI.SN up-to-date even if there is no assigned device or if APICv is
+ * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC.
*/
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
/*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed.
*/
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ /*
+ * Clear SN if it was set due to being preempted. Again, do
+ * this even if there is no assigned device for simplicity.
+ */
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
+ return;
}
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
- dest = cpu_physical_id(cpu);
+ /*
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_del(&vmx->pi_wakeup_list);
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ }
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ dest = cpu_physical_id(cpu);
+ if (!x2apic_mode)
+ dest = (dest << 8) & 0xFF00;
+
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
+ new.ndst = dest;
new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
+
+ local_irq_restore(flags);
after_clear_sn:
@@ -77,130 +129,71 @@ after_clear_sn:
pi_set_on(pi_desc);
}
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
}
/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
*/
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
- unsigned int dest;
- struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
+ unsigned long flags;
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
+ local_irq_save(flags);
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vmx->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
+ WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
- if (x2apic_mode)
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
+ do {
+ old.control = new.control = READ_ONCE(pi_desc->control);
/* set 'NV' to 'wakeup vector' */
new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
+ } while (pi_try_set_control(pi_desc, old.control, new.control));
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
+ local_irq_restore(flags);
}
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
- if (vcpu->pre_pcpu == -1)
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
+ if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ pi_enable_wakeup_handler(vcpu);
+
+ /*
+ * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
+ * as blocking and preempted, e.g. if it's preempted between setting
+ * its wait state and manually scheduling out.
+ */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
/*
@@ -208,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/
void pi_wakeup_handler(void)
{
- struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
+ struct vcpu_vmx *vmx;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+ list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
+ pi_wakeup_list) {
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
+ if (pi_test_on(&vmx->pi_desc))
+ kvm_vcpu_wake_up(&vmx->vcpu);
}
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
void __init pi_init_cpu(int cpu)
{
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -241,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in pi_pre_block().
+ * came along after the initial check in vmx_vcpu_pi_put().
*/
void vmx_pi_start_assignment(struct kvm *kvm)
{
@@ -270,9 +262,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
+ if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 7f7b2326caf5..eb14e76b84ef 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,7 +40,13 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
@@ -74,13 +80,13 @@ static inline void pi_clear_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_on(struct pi_desc *pi_desc)
+static inline bool pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
(unsigned long *)&pi_desc->control);
}
-static inline int pi_test_sn(struct pi_desc *pi_desc)
+static inline bool pi_test_sn(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_SN,
(unsigned long *)&pi_desc->control);
@@ -88,8 +94,6 @@ static inline int pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6e5de2e2b0da..e325c290a816 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -129,6 +129,11 @@ static inline bool is_machine_check(u32 intr_info)
return is_exception_n(intr_info, MC_VECTOR);
}
+static inline bool is_nm_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, NM_VECTOR);
+}
+
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index cab6ba7a5005..2251b60920f8 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -8,7 +8,7 @@
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 2a45f026ee11..746129ddd5ae 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
{
unsigned short offset;
unsigned int index;
@@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
return -ENOENT;
index = array_index_nospec(index, nr_vmcs12_fields);
- offset = vmcs_field_to_offset_table[index];
+ offset = vmcs12_field_offsets[index];
if (offset == 0)
return -ENOENT;
return offset;
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 3a6461694fc2..435c187927c4 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -49,14 +49,14 @@ SYM_FUNC_START_LOCAL(vmx_vmenter)
je 2f
1: vmresume
- ret
+ RET
2: vmlaunch
- ret
+ RET
3: cmpb $0, kvm_rebooting
je 4f
- ret
+ RET
4: ud2
_ASM_EXTABLE(1b, 3b)
@@ -89,7 +89,7 @@ SYM_FUNC_START(vmx_vmexit)
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
- ret
+ RET
SYM_FUNC_END(vmx_vmexit)
/**
@@ -228,7 +228,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
@@ -293,7 +293,7 @@ SYM_FUNC_START(vmread_error_trampoline)
pop %_ASM_AX
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmread_error_trampoline)
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
@@ -326,5 +326,5 @@ SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba66c171d951..aca3ae2a02f3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -36,6 +36,7 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
@@ -161,6 +162,8 @@ static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
MSR_FS_BASE,
MSR_GS_BASE,
MSR_KERNEL_GS_BASE,
+ MSR_IA32_XFD,
+ MSR_IA32_XFD_ERR,
#endif
MSR_IA32_SYSENTER_CS,
MSR_IA32_SYSENTER_ESP,
@@ -602,15 +605,13 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
unsigned int slot = msr - vmx->guest_uret_msrs;
int ret = 0;
- u64 old_msr_data = msr->data;
- msr->data = data;
if (msr->load_into_hardware) {
preempt_disable();
- ret = kvm_set_user_return_msr(slot, msr->data, msr->mask);
+ ret = kvm_set_user_return_msr(slot, data, msr->mask);
preempt_enable();
- if (ret)
- msr->data = old_msr_data;
}
+ if (!ret)
+ msr->data = data;
return ret;
}
@@ -763,6 +764,14 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, match);
}
+ /*
+ * Disabling xfd interception indicates that dynamic xfeatures
+ * might be used in the guest. Always trap #NM in this case
+ * to save guest xfd_err timely.
+ */
+ if (vcpu->arch.xfd_no_write_intercept)
+ eb |= (1u << NM_VECTOR);
+
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -1071,9 +1080,14 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base)
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base)
{
+ if (unlikely(cr3 != host->cr3)) {
+ vmcs_writel(HOST_CR3, cr3);
+ host->cr3 = cr3;
+ }
if (unlikely(fs_sel != host->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
@@ -1168,7 +1182,9 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
gs_base = segment_base(gs_sel);
#endif
- vmx_set_host_fs_gs(host_state, fs_sel, gs_sel, fs_base, gs_base);
+ vmx_set_vmcs_host_state(host_state, __get_current_cr3_fast(),
+ fs_sel, gs_sel, fs_base, gs_base);
+
vmx->guest_state_loaded = true;
}
@@ -1271,7 +1287,6 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
- unsigned long sysenter_esp;
/*
* Flush all EPTP/VPID contexts, the new pCPU may have stale
@@ -1287,8 +1302,11 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+ if (IS_ENABLED(CONFIG_IA32_EMULATION) || IS_ENABLED(CONFIG_X86_32)) {
+ /* 22.2.3 */
+ vmcs_writel(HOST_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(cpu) + 1));
+ }
vmx->loaded_vmcs->cpu = cpu;
}
@@ -1363,6 +1381,11 @@ void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmx->emulation_required = vmx_emulation_required(vcpu);
}
+static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
+{
+ return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
+}
+
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -1464,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
- * not point tthe failing instruction, and even if it did, the code
+ * not point at the failing instruction, and even if it did, the code
* stream is inaccessible. Inject #UD instead of exiting to userspace
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
@@ -1748,7 +1772,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
}
/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
+ * Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1955,6 +1979,24 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
+ case MSR_IA32_XFD:
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ /*
+ * Always intercepting WRMSR could incur non-negligible
+ * overhead given xfd might be changed frequently in
+ * guest context switch. Disable write interception
+ * upon the first write with a non-zero value (indicating
+ * potential usage on dynamic xfeatures). Also update
+ * exception bitmap to trap #NM for proper virtualization
+ * of guest xfd_err.
+ */
+ if (!ret && data) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
+ MSR_TYPE_RW);
+ vcpu->arch.xfd_no_write_intercept = true;
+ vmx_update_exception_bitmap(vcpu);
+ }
+ break;
#endif
case MSR_IA32_SYSENTER_CS:
if (is_guest_mode(vcpu))
@@ -2095,9 +2137,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
- case MSR_IA32_TSC_ADJUST:
- ret = kvm_set_msr_common(vcpu, msr_info);
- break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
@@ -2565,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
@@ -2590,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, flags, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2609,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -2646,15 +2684,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
- if (IS_ENABLED(CONFIG_HYPERV) &&
- static_branch_unlikely(&enable_evmcs) &&
- (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
- struct hv_enlightened_vmcs *evmcs =
- (struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
- evmcs->hv_enlightenments_control.msr_bitmap = 1;
- }
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -2918,6 +2947,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
+}
+
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2966,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
- else if (!is_guest_mode(vcpu))
- vpid_sync_context(to_vmx(vcpu)->vpid);
else
- vpid_sync_context(nested_get_vpid02(vcpu));
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
- * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
- vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
- * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
- * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
- * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
- vpid_sync_context(to_vmx(vcpu)->vpid);
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -2984,7 +3018,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu)
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_PDPTR);
}
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
@@ -3066,6 +3100,13 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
/* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */
if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+
+ /*
+ * When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
+ * GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
+ */
+ if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
}
/* depends on vcpu->arch.cr0 to be set to a new value */
@@ -3109,9 +3150,9 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ else if (kvm_register_is_dirty(vcpu, VCPU_EXREG_CR3))
guest_cr3 = vcpu->arch.cr3;
- else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ else /* vmcs.GUEST_CR3 is already up-to-date. */
update_guest_cr3 = false;
vmx_ept_load_pdptrs(vcpu);
} else {
@@ -3122,6 +3163,7 @@ static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcs_writel(GUEST_CR3, guest_cr3);
}
+
static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
/*
@@ -3686,6 +3728,19 @@ void free_vpid(int vpid)
spin_unlock(&vmx_vpid_lock);
}
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+ /*
+ * When KVM is a nested hypervisor on top of Hyper-V and uses
+ * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+ * bitmap has changed.
+ */
+ if (static_branch_unlikely(&enable_evmcs))
+ evmcs_touch_msr_bitmap();
+
+ vmx->nested.force_msr_bitmap_recalc = true;
+}
+
void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3694,8 +3749,7 @@ void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3739,8 +3793,7 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
if (!cpu_has_vmx_msr_bitmap())
return;
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ vmx_msr_bitmap_l01_changed(vmx);
/*
* Mark the desired intercept state in shadow bitmap, this is needed
@@ -3878,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu);
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
{
#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
if (vcpu->mode == IN_GUEST_MODE) {
/*
* The vector of interrupt to be delivered to vcpu had
@@ -3911,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ return;
}
#endif
- return false;
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3930,9 +3986,21 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
*/
vmx->nested.pi_pending = true;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+ /*
+ * This pairs with the smp_mb_*() after setting vcpu->mode in
+ * vcpu_enter_guest() to guarantee the vCPU sees the event
+ * request if triggering a posted interrupt "fails" because
+ * vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
+ * the smb_wmb() in kvm_make_request() only ensures everything
+ * done before making the request is visible when the request
+ * is visible, it doesn't ensure ordering between the store to
+ * vcpu->requests and the load from vcpu->mode.
+ */
+ smp_mb__after_atomic();
+
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -3963,10 +4031,13 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_on(&vmx->pi_desc))
return 0;
- if (vcpu != kvm_get_running_vcpu() &&
- !kvm_vcpu_trigger_posted_interrupt(vcpu, false))
- kvm_vcpu_kick(vcpu);
-
+ /*
+ * The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
+ * after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
+ * guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
+ * posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
+ */
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
return 0;
}
@@ -4021,6 +4092,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+
+ /*
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
+ */
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@@ -4039,8 +4120,10 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
~vcpu->arch.cr4_guest_rsvd_bits;
- if (!enable_ept)
- vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
+ if (!enable_ept) {
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
+ }
if (is_guest_mode(&vmx->vcpu))
vcpu->arch.cr4_guest_owned_bits &=
~get_vmcs12(vcpu)->cr4_guest_host_mask;
@@ -4692,7 +4775,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (kvm_emulate_instruction(vcpu, 0)) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -4748,6 +4831,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info) || is_nmi(intr_info))
return 1; /* handled by handle_exception_nmi_irqoff() */
+ /*
+ * Queue the exception here instead of in handle_nm_fault_irqoff().
+ * This ensures the nested_vmx check is not skipped so vmexit can
+ * be reflected to L1 (when it intercepts #NM) before reaching this
+ * point.
+ */
+ if (is_nm_fault(intr_info)) {
+ kvm_queue_exception(vcpu, NM_VECTOR);
+ return 1;
+ }
+
if (is_invalid_opcode(intr_info))
return handle_ud(vcpu);
@@ -4811,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Note, skipping ICEBP also
+ * clears STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -5307,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -5336,6 +5455,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending;
+}
+
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5355,15 +5482,14 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending) {
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
- return kvm_vcpu_halt(vcpu);
+ return kvm_emulate_halt_noskip(vcpu);
}
/*
@@ -5378,6 +5504,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5881,18 +6017,14 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
vmx_flush_pml_buffer(vcpu);
/*
- * We should never reach this point with a pending nested VM-Enter, and
- * more specifically emulation of L2 due to invalid guest state (see
- * below) should never happen as that means we incorrectly allowed a
- * nested VM-Enter with an invalid vmcs12.
+ * KVM should never reach this point with a pending nested VM-Enter.
+ * More specifically, short-circuiting VM-Entry to emulate L2 due to
+ * invalid guest state should never happen as that means KVM knowingly
+ * allowed a nested VM-Enter with an invalid vmcs12. More below.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
- /* If guest state is invalid, start emulating */
- if (vmx->emulation_required)
- return handle_invalid_guest_state(vcpu);
-
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
@@ -5914,10 +6046,30 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
*/
nested_mark_vmcs12_pages_dirty(vcpu);
+ /*
+ * Synthesize a triple fault if L2 state is invalid. In normal
+ * operation, nested VM-Enter rejects any attempt to enter L2
+ * with invalid state. However, those checks are skipped if
+ * state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
+ * L2 state is invalid, it means either L1 modified SMRAM state
+ * or userspace provided bad state. Synthesize TRIPLE_FAULT as
+ * doing so is architecturally allowed in the RSM case, and is
+ * the least awful solution for the userspace case without
+ * risking false positives.
+ */
+ if (vmx->emulation_required) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0);
+ return 1;
+ }
+
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
+ /* If guest state is invalid, start emulating. L2 is handled above. */
+ if (vmx->emulation_required)
+ return handle_invalid_guest_state(vcpu);
+
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -6262,9 +6414,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
- bool max_irr_updated;
+ bool got_posted_interrupt;
- if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
+ if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6426,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
- max_irr_updated =
+ got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
-
- /*
- * If we are running L2 and L1 has a new pending interrupt
- * which can be injected, this may cause a vmexit or it may
- * be injected into L2. Either way, this interrupt will be
- * processed via KVM_REQ_EVENT, not RVI, because we do not use
- * virtual interrupt delivery to inject L1 interrupts into L2.
- */
- if (is_guest_mode(vcpu) && max_irr_updated)
- kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
+ got_posted_interrupt = false;
}
- vmx_hwapic_irr_update(vcpu, max_irr);
+
+ /*
+ * Newly recognized interrupts are injected via either virtual interrupt
+ * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
+ * disabled in two cases:
+ *
+ * 1) If L2 is running and the vCPU has a new pending interrupt. If L1
+ * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
+ * VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
+ * into L2, but KVM doesn't use virtual interrupt delivery to inject
+ * interrupts into L2, and so KVM_REQ_EVENT is again needed.
+ *
+ * 2) If APICv is disabled for this vCPU, assigned devices may still
+ * attempt to post interrupts. The posted interrupt vector will cause
+ * a VM-Exit and the subsequent entry will call sync_pir_to_irr.
+ */
+ if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
+ vmx_set_rvi(max_irr);
+ else if (got_posted_interrupt)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return max_irr;
}
@@ -6317,11 +6480,33 @@ void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
unsigned long entry)
{
- kvm_before_interrupt(vcpu);
+ bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
+
+ kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
vmx_do_interrupt_nmi_irqoff(entry);
kvm_after_interrupt(vcpu);
}
+static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Save xfd_err to guest_fpu before interrupt is enabled, so the
+ * MSR value is not clobbered by the host activity before the guest
+ * has chance to consume it.
+ *
+ * Do not blindly read xfd_err here, since this exception might
+ * be caused by L1 interception on a platform which doesn't
+ * support xfd at all.
+ *
+ * Do it conditionally upon guest_fpu::xfd. xfd_err matters
+ * only when xfd contains a non-zero value.
+ *
+ * Queuing exception is done in vmx_handle_exit. See comment there.
+ */
+ if (vcpu->arch.guest_fpu.fpstate->xfd)
+ rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
@@ -6330,6 +6515,9 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* if exit due to PF check for async PF */
if (is_page_fault(intr_info))
vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
+ /* if exit due to NM, handle before interrupts are enabled */
+ else if (is_nm_fault(intr_info))
+ handle_nm_fault_irqoff(&vmx->vcpu);
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
@@ -6588,7 +6776,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4;
+ unsigned long cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6601,9 +6789,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
* consistency check VM-Exit due to invalid guest state and bail.
*/
if (unlikely(vmx->emulation_required)) {
-
- /* We don't emulate invalid state of a nested guest */
- vmx->fail = is_guest_mode(vcpu);
+ vmx->fail = 0;
vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
vmx->exit_reason.failed_vmentry = 1;
@@ -6631,12 +6817,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- cr3 = __get_current_cr3_fast();
- if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
- vmcs_writel(HOST_CR3, cr3);
- vmx->loaded_vmcs->host_state.cr3 = cr3;
- }
+ vcpu->arch.regs_dirty = 0;
cr4 = cr4_read_shadow();
if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
@@ -6725,7 +6906,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
loadsegment(es, __USER_DS);
#endif
- vmx_register_cache_reset(vcpu);
+ vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
pt_guest_exit(vmx);
@@ -6793,6 +6974,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
+ INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@@ -6826,6 +7009,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
+ /*
+ * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+ * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+ * feature only for vmcs01, KVM currently isn't equipped to realize any
+ * performance benefits from enabling it for vmcs02.
+ */
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+ (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ evmcs->hv_enlightenments_control.msr_bitmap = 1;
+ }
+
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -6931,7 +7127,6 @@ static int __init vmx_check_processor_compat(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
- u64 ipat = 0;
/* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
* memory aliases with conflicting memory types and sometimes MCEs.
@@ -6951,30 +7146,22 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* EPT memory type is used to emulate guest CD/MTRR.
*/
- if (is_mmio) {
- cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
+ if (is_mmio)
+ return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
- ipat = VMX_EPT_IPAT_BIT;
- cache = MTRR_TYPE_WRBACK;
- goto exit;
- }
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
- ipat = VMX_EPT_IPAT_BIT;
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cache = MTRR_TYPE_WRBACK;
else
cache = MTRR_TYPE_UNCACHABLE;
- goto exit;
- }
- cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IPAT_BIT;
+ }
-exit:
- return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
+ return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
}
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
@@ -7166,6 +7353,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
}
}
+ if (kvm_cpu_cap_has(X86_FEATURE_XFD))
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
+ !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
+
+
set_cr4_guest_host_mask(vmx);
vmx_write_encls_bitmap(vcpu, NULL);
@@ -7405,25 +7597,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
- if (pi_pre_block(vcpu))
- return 1;
-
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
- if (kvm_x86_ops.set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
-
- pi_post_block(vcpu);
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7509,6 +7682,7 @@ static void hardware_unsetup(void)
static bool vmx_check_apicv_inhibit_reasons(ulong bit)
{
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
@@ -7558,12 +7732,14 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .get_if_flag = vmx_get_if_flag,
.tlb_flush_all = vmx_flush_tlb_all,
.tlb_flush_current = vmx_flush_tlb_current,
.tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest,
+ .vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7622,9 +7798,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
@@ -7653,6 +7826,20 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
};
+static unsigned int vmx_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
+
+ /* '0' on failure so that the !PT case can use a RET0 static call. */
+ if (!kvm_arch_pmi_in_guest(vcpu))
+ return 0;
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+ return 1;
+}
+
static __init void vmx_setup_user_return_msrs(void)
{
@@ -7679,11 +7866,13 @@ static __init void vmx_setup_user_return_msrs(void)
kvm_add_user_return_msr(vmx_uret_msrs_list[i]);
}
+static struct kvm_x86_init_ops vmx_init_ops __initdata;
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, ept_lpage_level;
+ int r;
store_idt(&dt);
host_idt_base = dt.address;
@@ -7761,10 +7950,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
- if (!cpu_has_vmx_apicv()) {
+ if (!cpu_has_vmx_apicv())
enable_apicv = 0;
+ if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
- }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
@@ -7780,16 +7969,8 @@ static __init int hardware_setup(void)
kvm_mmu_set_ept_masks(enable_ept_ad_bits,
cpu_has_vmx_ept_execute_only());
- if (!enable_ept)
- ept_lpage_level = 0;
- else if (cpu_has_vmx_ept_1g_page())
- ept_lpage_level = PG_LEVEL_1G;
- else if (cpu_has_vmx_ept_2m_page())
- ept_lpage_level = PG_LEVEL_2M;
- else
- ept_lpage_level = PG_LEVEL_4K;
kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(),
- ept_lpage_level);
+ ept_caps_to_lpage_level(vmx_capability.ept));
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7837,6 +8018,10 @@ static __init int hardware_setup(void)
return -EINVAL;
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
+ if (pt_mode == PT_MODE_HOST_GUEST)
+ vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
+ else
+ vmx_init_ops.handle_intel_pt_intr = NULL;
setup_default_sgx_lepubkeyhash();
@@ -7865,6 +8050,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.disabled_by_bios = vmx_disabled_by_bios,
.check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
+ .handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
};
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 4df2ac24ffc1..7f2c82e7f38f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -159,6 +159,15 @@ struct nested_vmx {
bool dirty_vmcs12;
/*
+ * Indicates whether MSR bitmap for L2 needs to be rebuilt due to
+ * changes in MSR bitmap for L1 or switching to a different L2. Note,
+ * this flag can only be used reliably in conjunction with a paravirt L1
+ * which informs L0 whether any changes to MSR bitmap for L2 were done
+ * on its side.
+ */
+ bool force_msr_bitmap_recalc;
+
+ /*
* Indicates lazily loaded guest state has not yet been decached from
* vmcs02.
*/
@@ -308,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
@@ -340,7 +352,7 @@ struct vcpu_vmx {
struct lbr_desc lbr_desc;
/* Save desired MSR intercept (read: pass-through) state */
-#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 15
struct {
DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -362,8 +374,9 @@ int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
-void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
- unsigned long fs_base, unsigned long gs_base);
+void vmx_set_vmcs_host_state(struct vmcs_host_state *host, unsigned long cr3,
+ u16 fs_sel, u16 gs_sel,
+ unsigned long fs_base, unsigned long gs_base);
int vmx_get_cpl(struct kvm_vcpu *vcpu);
bool vmx_emulation_required(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu);
@@ -473,19 +486,21 @@ BUILD_CONTROLS_SHADOW(pin, PIN_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR0)
- | (1 << VCPU_EXREG_CR3)
- | (1 << VCPU_EXREG_CR4)
- | (1 << VCPU_EXREG_EXIT_INFO_1)
- | (1 << VCPU_EXREG_EXIT_INFO_2));
- vcpu->arch.regs_dirty = 0;
-}
+/*
+ * VMX_REGS_LAZY_LOAD_SET - The set of registers that will be updated in the
+ * cache on demand. Other registers not listed here are synced to
+ * the cache immediately after VM-Exit.
+ */
+#define VMX_REGS_LAZY_LOAD_SET ((1 << VCPU_REGS_RIP) | \
+ (1 << VCPU_REGS_RSP) | \
+ (1 << VCPU_EXREG_RFLAGS) | \
+ (1 << VCPU_EXREG_PDPTR) | \
+ (1 << VCPU_EXREG_SEGMENTS) | \
+ (1 << VCPU_EXREG_CR0) | \
+ (1 << VCPU_EXREG_CR3) | \
+ (1 << VCPU_EXREG_CR4) | \
+ (1 << VCPU_EXREG_EXIT_INFO_1) | \
+ (1 << VCPU_EXREG_EXIT_INFO_2))
static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 9e9ef47e988c..5e7f41225780 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -71,6 +71,31 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
{
unsigned long value;
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+
+ asm_volatile_goto("1: vmread %[field], %[output]\n\t"
+ "jna %l[do_fail]\n\t"
+
+ _ASM_EXTABLE(1b, %l[do_exception])
+
+ : [output] "=r" (value)
+ : [field] "r" (field)
+ : "cc"
+ : do_fail, do_exception);
+
+ return value;
+
+do_fail:
+ WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ return 0;
+
+do_exception:
+ kvm_spurious_fault();
+ return 0;
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
@@ -80,9 +105,11 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
* @field, and bounce through the trampoline to preserve
* volatile registers.
*/
- "push $0\n\t"
+ "xorl %k1, %k1\n\t"
+ "2:\n\t"
+ "push %1\n\t"
"push %2\n\t"
- "2:call vmread_error_trampoline\n\t"
+ "call vmread_error_trampoline\n\t"
/*
* Unwind the stack. Note, the trampoline zeros out the
@@ -93,14 +120,12 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
"3:\n\t"
/* VMREAD faulted. As above, except push '1' for @fault. */
- ".pushsection .fixup, \"ax\"\n\t"
- "4: push $1\n\t"
- "push %2\n\t"
- "jmp 2b\n\t"
- ".popsection\n\t"
- _ASM_EXTABLE(1b, 4b)
- : ASM_CALL_CONSTRAINT, "=r"(value) : "r"(field) : "cc");
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_ONE_REG, %1)
+
+ : ASM_CALL_CONSTRAINT, "=&r"(value) : "r"(field) : "cc");
return value;
+
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
}
static __always_inline u16 vmcs_read16(unsigned long field)