aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/lapic.c27
-rw-r--r--arch/x86/kvm/mmu/mmu.c4
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c9
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h1
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c38
-rw-r--r--arch/x86/kvm/svm/avic.c171
-rw-r--r--arch/x86/kvm/svm/nested.c43
-rw-r--r--arch/x86/kvm/svm/svm.c42
-rw-r--r--arch/x86/kvm/svm/svm.h6
-rw-r--r--arch/x86/kvm/vmx/vmx.c77
-rw-r--r--arch/x86/kvm/vmx/vmx.h2
-rw-r--r--arch/x86/kvm/x86.c53
-rw-r--r--arch/x86/kvm/xen.h6
13 files changed, 318 insertions, 161 deletions
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f1bdac3f5aa8..0e68b4c937fc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -2039,6 +2039,19 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
+static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
+{
+ struct kvm *kvm = apic->vcpu->kvm;
+
+ if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
+ return;
+
+ if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
+ return;
+
+ kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+}
+
static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
{
int ret = 0;
@@ -2047,10 +2060,12 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
- if (!apic_x2apic_mode(apic))
+ if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- else
+ kvm_lapic_xapic_id_updated(apic);
+ } else {
ret = 1;
+ }
break;
case APIC_TASKPRI:
@@ -2336,8 +2351,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
MSR_IA32_APICBASE_BASE;
if ((value & MSR_IA32_APICBASE_ENABLE) &&
- apic->base_address != APIC_DEFAULT_PHYS_BASE)
- pr_warn_once("APIC base relocation is unsupported by KVM");
+ apic->base_address != APIC_DEFAULT_PHYS_BASE) {
+ kvm_set_apicv_inhibit(apic->vcpu->kvm,
+ APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
+ }
}
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
@@ -2648,6 +2665,8 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
__kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
}
+ } else {
+ kvm_lapic_xapic_id_updated(vcpu->arch.apic);
}
return 0;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f4653688fa6d..17252f39bd7c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3411,7 +3411,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
root = mmu_alloc_root(vcpu, i << (30 - PAGE_SHIFT),
i << 30, PT32_ROOT_LEVEL, true);
mmu->pae_root[i] = root | PT_PRESENT_MASK |
- shadow_me_mask;
+ shadow_me_value;
}
mmu->root.hpa = __pa(mmu->pae_root);
} else {
@@ -5179,7 +5179,7 @@ static void __kvm_mmu_free_obsolete_roots(struct kvm *kvm, struct kvm_mmu *mmu)
roots_to_free |= KVM_MMU_ROOT_CURRENT;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
- if (is_obsolete_root(kvm, mmu->root.hpa))
+ if (is_obsolete_root(kvm, mmu->prev_roots[i].hpa))
roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
}
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 6d3b3e5a5533..ee4802d7b36c 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -146,6 +146,15 @@ static bool try_step_up(struct tdp_iter *iter)
}
/*
+ * Step the iterator back up a level in the paging structure. Should only be
+ * used when the iterator is below the root level.
+ */
+void tdp_iter_step_up(struct tdp_iter *iter)
+{
+ WARN_ON(!try_step_up(iter));
+}
+
+/*
* Step to the next SPTE in a pre-order traversal of the paging structure.
* To get to the next SPTE, the iterator either steps down towards the goal
* GFN, if at a present, non-last-level SPTE, or over to a SPTE mapping a
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index f0af385c56e0..adfca0cf94d3 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -114,5 +114,6 @@ void tdp_iter_start(struct tdp_iter *iter, struct kvm_mmu_page *root,
int min_level, gfn_t next_last_level_gfn);
void tdp_iter_next(struct tdp_iter *iter);
void tdp_iter_restart(struct tdp_iter *iter);
+void tdp_iter_step_up(struct tdp_iter *iter);
#endif /* __KVM_X86_MMU_TDP_ITER_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 841feaa48be5..7b9265d67131 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1742,12 +1742,12 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
struct tdp_iter iter;
+ int max_mapping_level;
kvm_pfn_t pfn;
rcu_read_lock();
tdp_root_for_each_pte(iter, root, start, end) {
-retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
@@ -1755,15 +1755,41 @@ retry:
!is_last_spte(iter.old_spte, iter.level))
continue;
+ /*
+ * This is a leaf SPTE. Check if the PFN it maps can
+ * be mapped at a higher level.
+ */
pfn = spte_to_pfn(iter.old_spte);
- if (kvm_is_reserved_pfn(pfn) ||
- iter.level >= kvm_mmu_max_mapping_level(kvm, slot, iter.gfn,
- pfn, PG_LEVEL_NUM))
+
+ if (kvm_is_reserved_pfn(pfn))
continue;
+ max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
+ iter.gfn, pfn, PG_LEVEL_NUM);
+
+ WARN_ON(max_mapping_level < iter.level);
+
+ /*
+ * If this page is already mapped at the highest
+ * viable level, there's nothing more to do.
+ */
+ if (max_mapping_level == iter.level)
+ continue;
+
+ /*
+ * The page can be remapped at a higher level, so step
+ * up to zap the parent SPTE.
+ */
+ while (max_mapping_level > iter.level)
+ tdp_iter_step_up(&iter);
+
/* Note, a successful atomic zap also does a remote TLB flush. */
- if (tdp_mmu_zap_spte_atomic(kvm, &iter))
- goto retry;
+ tdp_mmu_zap_spte_atomic(kvm, &iter);
+
+ /*
+ * If the atomic zap fails, the iter will recurse back into
+ * the same subtree to retry.
+ */
}
rcu_read_unlock();
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 54fe03714f8a..d1bc5820ea46 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -291,58 +291,91 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 dest, apic_id;
- struct kvm_vcpu *vcpu;
+ u32 l1_physical_id, dest;
+ struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
- u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
- /*
- * The AVIC incomplete IPI #vmexit info provides index into
- * the physical APIC ID table, which can be used to derive
- * guest physical APIC ID.
- */
+ if (apic_x2apic_mode(source))
+ dest = icrh;
+ else
+ dest = GET_APIC_DEST_FIELD(icrh);
+
if (dest_mode == APIC_DEST_PHYSICAL) {
- apic_id = index;
+ /* broadcast destination, use slow path */
+ if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST)
+ return -EINVAL;
+ if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
+ return -EINVAL;
+
+ l1_physical_id = dest;
+
+ if (WARN_ON_ONCE(l1_physical_id != index))
+ return -EINVAL;
+
} else {
- if (!apic_x2apic_mode(source)) {
- /* For xAPIC logical mode, the index is for logical APIC table. */
- apic_id = avic_logical_id_table[index] & 0x1ff;
+ u32 bitmap, cluster;
+ int logid_index;
+
+ if (apic_x2apic_mode(source)) {
+ /* 16 bit dest mask, 16 bit cluster id */
+ bitmap = dest & 0xFFFF0000;
+ cluster = (dest >> 16) << 4;
+ } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
+ /* 8 bit dest mask*/
+ bitmap = dest;
+ cluster = 0;
} else {
- return -EINVAL;
+ /* 4 bit desk mask, 4 bit cluster id */
+ bitmap = dest & 0xF;
+ cluster = (dest >> 4) << 2;
}
- }
- /*
- * Assuming vcpu ID is the same as physical apic ID,
- * and use it to retrieve the target vCPU.
- */
- vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
- if (!vcpu)
- return -EINVAL;
+ if (unlikely(!bitmap))
+ /* guest bug: nobody to send the logical interrupt to */
+ return 0;
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_APIC_DEST_FIELD(icrh);
+ if (!is_power_of_2(bitmap))
+ /* multiple logical destinations, use slow path */
+ return -EINVAL;
- /*
- * Try matching the destination APIC ID with the vCPU.
- */
- if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- return 0;
+ logid_index = cluster + __ffs(bitmap);
+
+ if (apic_x2apic_mode(source)) {
+ l1_physical_id = logid_index;
+ } else {
+ u32 *avic_logical_id_table =
+ page_address(kvm_svm->avic_logical_id_table_page);
+
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ if (WARN_ON_ONCE(index != logid_index))
+ return -EINVAL;
+
+ /* guest bug: non existing/reserved logical destination */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return 0;
+
+ l1_physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ }
}
- return -EINVAL;
+ target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
+ if (unlikely(!target_vcpu))
+ /* guest bug: non existing vCPU is a target of this IPI*/
+ return 0;
+
+ target_vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(target_vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
@@ -508,35 +541,6 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
return ret;
}
-static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
-{
- u64 *old, *new;
- struct vcpu_svm *svm = to_svm(vcpu);
- u32 id = kvm_xapic_id(vcpu->arch.apic);
-
- if (vcpu->vcpu_id == id)
- return 0;
-
- old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
- new = avic_get_physical_id_entry(vcpu, id);
- if (!new || !old)
- return 1;
-
- /* We need to move physical_id_entry to new offset */
- *new = *old;
- *old = 0ULL;
- to_svm(vcpu)->avic_physical_id_cache = new;
-
- /*
- * Also update the guest physical APIC ID in the logical
- * APIC ID table entry if already setup the LDR.
- */
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
-
- return 0;
-}
-
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -555,10 +559,6 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
AVIC_UNACCEL_ACCESS_OFFSET_MASK;
switch (offset) {
- case APIC_ID:
- if (avic_handle_apic_id_update(vcpu))
- return 0;
- break;
case APIC_LDR:
if (avic_handle_ldr_update(vcpu))
return 0;
@@ -650,8 +650,6 @@ int avic_init_vcpu(struct vcpu_svm *svm)
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
{
- if (avic_handle_apic_id_update(vcpu) != 0)
- return;
avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -910,7 +908,9 @@ bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
BIT(APICV_INHIBIT_REASON_X2APIC) |
BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV);
+ BIT(APICV_INHIBIT_REASON_SEV) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
@@ -946,7 +946,7 @@ out:
return ret;
}
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
u64 entry;
int h_physical_id = kvm_cpu_get_apicid(cpu);
@@ -978,7 +978,7 @@ void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
-void __avic_vcpu_put(struct kvm_vcpu *vcpu)
+void avic_vcpu_put(struct kvm_vcpu *vcpu)
{
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
@@ -997,25 +997,6 @@ void __avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-static void avic_vcpu_load(struct kvm_vcpu *vcpu)
-{
- int cpu = get_cpu();
-
- WARN_ON(cpu != vcpu->cpu);
-
- __avic_vcpu_load(vcpu, cpu);
-
- put_cpu();
-}
-
-static void avic_vcpu_put(struct kvm_vcpu *vcpu)
-{
- preempt_disable();
-
- __avic_vcpu_put(vcpu);
-
- preempt_enable();
-}
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
@@ -1042,7 +1023,7 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
vmcb_mark_dirty(vmcb, VMCB_AVIC);
if (activated)
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
else
avic_vcpu_put(vcpu);
@@ -1075,5 +1056,5 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_apicv_active(vcpu))
return;
- avic_vcpu_load(vcpu);
+ avic_vcpu_load(vcpu, vcpu->cpu);
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index bed5e1692cef..ba7cd26f438f 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -616,6 +616,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
struct vmcb *vmcb01 = svm->vmcb01.ptr;
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
+ u32 pause_count12;
+ u32 pause_thresh12;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
@@ -671,27 +673,25 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+ pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
+ pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
- /* use guest values since host doesn't use them */
- vmcb02->control.pause_filter_count =
- svm->pause_filter_enabled ?
- svm->nested.ctl.pause_filter_count : 0;
+ /* use guest values since host doesn't intercept PAUSE */
+ vmcb02->control.pause_filter_count = pause_count12;
+ vmcb02->control.pause_filter_thresh = pause_thresh12;
- vmcb02->control.pause_filter_thresh =
- svm->pause_threshold_enabled ?
- svm->nested.ctl.pause_filter_thresh : 0;
-
- } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
- /* use host values when guest doesn't use them */
+ } else {
+ /* start from host values otherwise */
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
- } else {
- /*
- * Intercept every PAUSE otherwise and
- * ignore both host and guest values
- */
- vmcb02->control.pause_filter_count = 0;
- vmcb02->control.pause_filter_thresh = 0;
+
+ /* ... but ensure filtering is disabled if so requested. */
+ if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ if (!pause_count12)
+ vmcb02->control.pause_filter_count = 0;
+ if (!pause_thresh12)
+ vmcb02->control.pause_filter_thresh = 0;
+ }
}
nested_svm_transition_tlb_flush(vcpu);
@@ -951,8 +951,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
- if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+ if (!kvm_pause_in_guest(vcpu->kvm)) {
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
+
+ }
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
@@ -982,7 +985,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
- svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
}
svm->nested.ctl.nested_cr3 = 0;
@@ -1387,7 +1390,7 @@ void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
vcpu->arch.tsc_scaling_ratio =
kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
svm->tsc_ratio_msr);
- svm_write_tsc_multiplier(vcpu, vcpu->arch.tsc_scaling_ratio);
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
}
/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 200045f71df0..87da90360bc7 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -465,11 +465,24 @@ static int has_svm(void)
return 1;
}
+void __svm_write_tsc_multiplier(u64 multiplier)
+{
+ preempt_disable();
+
+ if (multiplier == __this_cpu_read(current_tsc_ratio))
+ goto out;
+
+ wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __this_cpu_write(current_tsc_ratio, multiplier);
+out:
+ preempt_enable();
+}
+
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
- wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
cpu_svm_disable();
@@ -515,8 +528,7 @@ static int svm_hardware_enable(void)
* Set the default value, even if we don't use TSC scaling
* to avoid having stale value in the msr
*/
- wrmsrl(MSR_AMD64_TSC_RATIO, SVM_TSC_RATIO_DEFAULT);
- __this_cpu_write(current_tsc_ratio, SVM_TSC_RATIO_DEFAULT);
+ __svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
}
@@ -909,7 +921,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count = __grow_ple_window(old,
@@ -930,7 +942,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
- if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ if (kvm_pause_in_guest(vcpu->kvm))
return;
control->pause_filter_count =
@@ -999,11 +1011,12 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
+static void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier)
{
- wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
+ __svm_write_tsc_multiplier(multiplier);
}
+
/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm)
@@ -1363,13 +1376,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
sev_es_prepare_switch_to_guest(hostsa);
}
- if (tsc_scaling) {
- u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
- if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
- __this_cpu_write(current_tsc_ratio, tsc_ratio);
- wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
- }
- }
+ if (tsc_scaling)
+ __svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
if (likely(tsc_aux_uret_slot >= 0))
kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull);
@@ -1392,13 +1400,13 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
indirect_branch_prediction_barrier();
}
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_load(vcpu, cpu);
+ avic_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
if (kvm_vcpu_apicv_active(vcpu))
- __avic_vcpu_put(vcpu);
+ avic_vcpu_put(vcpu);
svm_prepare_host_switch(vcpu);
@@ -4255,6 +4263,8 @@ out:
static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
+ if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR)
+ vcpu->arch.at_instruction_boundary = true;
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 21c5460e947a..1bddd336a27e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -590,7 +590,7 @@ int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
int nested_svm_exit_special(struct vcpu_svm *svm);
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu);
-void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 multiplier);
+void __svm_write_tsc_multiplier(u64 multiplier);
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
struct vmcb_control_area *control);
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
@@ -610,8 +610,8 @@ void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
-void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-void __avic_vcpu_put(struct kvm_vcpu *vcpu);
+void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a07e8cd753ec..3a919e49129b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -229,6 +229,9 @@ static const struct {
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -360,6 +363,60 @@ static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ msr |= FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
@@ -2252,6 +2309,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
@@ -4553,6 +4614,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
vpid_sync_context(vmx->vpid);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
@@ -6547,6 +6610,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
return;
handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ vcpu->arch.at_instruction_boundary = true;
}
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -6771,6 +6835,11 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
@@ -6780,6 +6849,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vcpu->arch.cr2 = native_read_cr2();
+ vmx_enable_fb_clear(vmx);
+
guest_state_exit_irqoff();
}
@@ -7708,7 +7779,9 @@ static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
BIT(APICV_INHIBIT_REASON_ABSENT) |
BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ);
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
return supported & BIT(reason);
}
@@ -8211,6 +8284,8 @@ static int __init vmx_init(void)
return r;
}
+ vmx_setup_fb_clear_ctrl();
+
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b98c7e96697a..8d2342ede0c5 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -348,6 +348,8 @@ struct vcpu_vmx {
u64 msr_ia32_feature_control_valid_bits;
/* SGX Launch Control public key hash */
u64 msr_ia32_sgxlepubkeyhash[4];
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
struct pt_desc pt_desc;
struct lbr_desc lbr_desc;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e9473c7c7390..1910e1e78b15 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -296,6 +296,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
STATS_DESC_COUNTER(VCPU, nested_run),
STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_COUNTER(VCPU, preemption_reported),
+ STATS_DESC_COUNTER(VCPU, preemption_other),
STATS_DESC_ICOUNTER(VCPU, guest_mode)
};
@@ -1615,6 +1617,9 @@ static u64 kvm_get_arch_capabilities(void)
*/
}
+ /* Guests don't need to know "Fill buffer clear control" exists */
+ data &= ~ARCH_CAP_FB_CLEAR_CTRL;
+
return data;
}
@@ -4625,6 +4630,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
struct kvm_memslots *slots;
static const u8 preempted = KVM_VCPU_PREEMPTED;
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4654,19 +4672,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted && !vcpu->arch.guest_state_protected)
- vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ if (vcpu->preempted) {
+ if (!vcpu->arch.guest_state_protected)
+ vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (kvm_xen_msr_enabled(vcpu->kvm))
- kvm_xen_runstate_set_preempted(vcpu);
- else
- kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (kvm_xen_msr_enabled(vcpu->kvm))
+ kvm_xen_runstate_set_preempted(vcpu);
+ else
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -9833,6 +9853,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
return;
down_read(&vcpu->kvm->arch.apicv_update_lock);
+ preempt_disable();
activate = kvm_vcpu_apicv_activated(vcpu);
@@ -9853,6 +9874,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
out:
+ preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -10422,6 +10444,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index ee5c4ae0755c..532a535a9e99 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -159,8 +159,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu)
* behalf of the vCPU. Only if the VMM does actually block
* does it need to enter RUNSTATE_blocked.
*/
- if (vcpu->preempted)
- kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
+ if (WARN_ON_ONCE(!vcpu->preempted))
+ return;
+
+ kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable);
}
/* 32-bit compatibility definitions, also used natively in 32-bit build */