Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--  arch/x86/kvm/x86.c  1112
1 file changed, 795 insertions(+), 317 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6eda2834fc05..17468d983fbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -58,6 +58,7 @@
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
+#include <linux/suspend.h>
#include <trace/events/kvm.h>
@@ -102,6 +103,8 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
+
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -113,6 +116,9 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+
struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -184,11 +190,6 @@ module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
*/
#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_user_return_msrs_global {
- int nr;
- u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
-};
-
struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
@@ -198,7 +199,9 @@ struct kvm_user_return_msrs {
} values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+u32 __read_mostly kvm_nr_uret_msrs;
+EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
+static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@ -212,55 +215,78 @@ EXPORT_SYMBOL_GPL(host_efer);
bool __read_mostly allow_smaller_maxphyaddr = 0;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+bool __read_mostly enable_apicv = true;
+EXPORT_SYMBOL_GPL(enable_apicv);
+
u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);
u64 __read_mostly supported_xss;
EXPORT_SYMBOL_GPL(supported_xss);
-struct kvm_stats_debugfs_item debugfs_entries[] = {
- VCPU_STAT("pf_fixed", pf_fixed),
- VCPU_STAT("pf_guest", pf_guest),
- VCPU_STAT("tlb_flush", tlb_flush),
- VCPU_STAT("invlpg", invlpg),
- VCPU_STAT("exits", exits),
- VCPU_STAT("io_exits", io_exits),
- VCPU_STAT("mmio_exits", mmio_exits),
- VCPU_STAT("signal_exits", signal_exits),
- VCPU_STAT("irq_window", irq_window_exits),
- VCPU_STAT("nmi_window", nmi_window_exits),
- VCPU_STAT("halt_exits", halt_exits),
- VCPU_STAT("halt_successful_poll", halt_successful_poll),
- VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
- VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
- VCPU_STAT("halt_wakeup", halt_wakeup),
- VCPU_STAT("hypercalls", hypercalls),
- VCPU_STAT("request_irq", request_irq_exits),
- VCPU_STAT("irq_exits", irq_exits),
- VCPU_STAT("host_state_reload", host_state_reload),
- VCPU_STAT("fpu_reload", fpu_reload),
- VCPU_STAT("insn_emulation", insn_emulation),
- VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
- VCPU_STAT("irq_injections", irq_injections),
- VCPU_STAT("nmi_injections", nmi_injections),
- VCPU_STAT("req_event", req_event),
- VCPU_STAT("l1d_flush", l1d_flush),
- VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
- VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
- VCPU_STAT("nested_run", nested_run),
- VCPU_STAT("directed_yield_attempted", directed_yield_attempted),
- VCPU_STAT("directed_yield_successful", directed_yield_successful),
- VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
- VM_STAT("mmu_pte_write", mmu_pte_write),
- VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
- VM_STAT("mmu_flooded", mmu_flooded),
- VM_STAT("mmu_recycled", mmu_recycled),
- VM_STAT("mmu_cache_miss", mmu_cache_miss),
- VM_STAT("mmu_unsync", mmu_unsync),
- VM_STAT("remote_tlb_flush", remote_tlb_flush),
- VM_STAT("largepages", lpages, .mode = 0444),
- VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
- VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
- { NULL }
+const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
+ KVM_GENERIC_VM_STATS(),
+ STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
+ STATS_DESC_COUNTER(VM, mmu_pte_write),
+ STATS_DESC_COUNTER(VM, mmu_pde_zapped),
+ STATS_DESC_COUNTER(VM, mmu_flooded),
+ STATS_DESC_COUNTER(VM, mmu_recycled),
+ STATS_DESC_COUNTER(VM, mmu_cache_miss),
+ STATS_DESC_ICOUNTER(VM, mmu_unsync),
+ STATS_DESC_ICOUNTER(VM, lpages),
+ STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
+ STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
+};
+static_assert(ARRAY_SIZE(kvm_vm_stats_desc) ==
+ sizeof(struct kvm_vm_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vm_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vm_stats_desc),
+};
+
+const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
+ KVM_GENERIC_VCPU_STATS(),
+ STATS_DESC_COUNTER(VCPU, pf_fixed),
+ STATS_DESC_COUNTER(VCPU, pf_guest),
+ STATS_DESC_COUNTER(VCPU, tlb_flush),
+ STATS_DESC_COUNTER(VCPU, invlpg),
+ STATS_DESC_COUNTER(VCPU, exits),
+ STATS_DESC_COUNTER(VCPU, io_exits),
+ STATS_DESC_COUNTER(VCPU, mmio_exits),
+ STATS_DESC_COUNTER(VCPU, signal_exits),
+ STATS_DESC_COUNTER(VCPU, irq_window_exits),
+ STATS_DESC_COUNTER(VCPU, nmi_window_exits),
+ STATS_DESC_COUNTER(VCPU, l1d_flush),
+ STATS_DESC_COUNTER(VCPU, halt_exits),
+ STATS_DESC_COUNTER(VCPU, request_irq_exits),
+ STATS_DESC_COUNTER(VCPU, irq_exits),
+ STATS_DESC_COUNTER(VCPU, host_state_reload),
+ STATS_DESC_COUNTER(VCPU, fpu_reload),
+ STATS_DESC_COUNTER(VCPU, insn_emulation),
+ STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
+ STATS_DESC_COUNTER(VCPU, hypercalls),
+ STATS_DESC_COUNTER(VCPU, irq_injections),
+ STATS_DESC_COUNTER(VCPU, nmi_injections),
+ STATS_DESC_COUNTER(VCPU, req_event),
+ STATS_DESC_COUNTER(VCPU, nested_run),
+ STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
+ STATS_DESC_COUNTER(VCPU, directed_yield_successful),
+ STATS_DESC_ICOUNTER(VCPU, guest_mode)
+};
+static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) ==
+ sizeof(struct kvm_vcpu_stat) / sizeof(u64));
+
+const struct kvm_stats_header kvm_vcpu_stats_header = {
+ .name_size = KVM_STATS_NAME_SIZE,
+ .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
+ .id_offset = sizeof(struct kvm_stats_header),
+ .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
+ .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
+ sizeof(kvm_vcpu_stats_desc),
};
u64 __read_mostly host_xcr0;
@@ -330,23 +356,53 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(user_return_msrs_global.msrs[slot], values->host);
+ wrmsrl(kvm_uret_msrs_list[slot], values->host);
values->curr = values->host;
}
}
}
-void kvm_define_user_return_msr(unsigned slot, u32 msr)
+static int kvm_probe_user_return_msr(u32 msr)
+{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+
+int kvm_add_user_return_msr(u32 msr)
{
- BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
- user_return_msrs_global.msrs[slot] = msr;
- if (slot >= user_return_msrs_global.nr)
- user_return_msrs_global.nr = slot + 1;
+ BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
+
+ if (kvm_probe_user_return_msr(msr))
+ return -1;
+
+ kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
+ return kvm_nr_uret_msrs++;
}
-EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
+
+int kvm_find_user_return_msr(u32 msr)
+{
+ int i;
+
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ if (kvm_uret_msrs_list[i] == msr)
+ return i;
+ }
+ return -1;
+}
+EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
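With the registration API replacing the old fixed-slot scheme, vendor code can register and look up user-return MSRs by value. A minimal, purely illustrative sketch of a caller (the example_* helpers below are hypothetical and are not the VMX/SVM call sites from this series):

static int tsc_aux_uret_slot = -1;

static void example_setup_uret_msrs(void)
{
        /* Returns -1 if the MSR cannot be read/written on this host. */
        tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
}

static void example_load_guest_tsc_aux(u64 guest_val)
{
        if (tsc_aux_uret_slot < 0)
                return;
        /* The host value is restored lazily by kvm_on_user_return(). */
        kvm_set_user_return_msr(tsc_aux_uret_slot, guest_val, -1ull);
}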
static void kvm_user_return_msr_cpu_online(void)
{
@@ -355,8 +411,8 @@ static void kvm_user_return_msr_cpu_online(void)
u64 value;
int i;
- for (i = 0; i < user_return_msrs_global.nr; ++i) {
- rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ for (i = 0; i < kvm_nr_uret_msrs; ++i) {
+ rdmsrl_safe(kvm_uret_msrs_list[i], &value);
msrs->values[i].host = value;
msrs->values[i].curr = value;
}
@@ -371,7 +427,7 @@ int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
value = (value & mask) | (msrs->values[slot].host & ~mask);
if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
if (err)
return 1;
@@ -751,13 +807,6 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
-static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
- void *data, int offset, int len, u32 access)
-{
- return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
- data, offset, len, access);
-}
-
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
@@ -792,6 +841,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ vcpu->arch.pdptrs_from_userspace = false;
out:
@@ -799,40 +849,14 @@ out:
}
EXPORT_SYMBOL_GPL(load_pdptrs);
-bool pdptrs_changed(struct kvm_vcpu *vcpu)
-{
- u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- int offset;
- gfn_t gfn;
- int r;
-
- if (!is_pae_paging(vcpu))
- return false;
-
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
- return true;
-
- gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
- offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
- r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
- PFERR_USER_MASK | PFERR_WRITE_MASK);
- if (r < 0)
- return true;
-
- return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-}
-EXPORT_SYMBOL_GPL(pdptrs_changed);
-
void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
-
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
}
- if ((cr0 ^ old_cr0) & update_bits)
+ if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
kvm_mmu_reset_context(vcpu);
if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
@@ -1011,10 +1035,7 @@ EXPORT_SYMBOL_GPL(kvm_is_valid_cr4);
void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
- unsigned long mmu_role_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
- if (((cr4 ^ old_cr4) & mmu_role_bits) ||
+ if (((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
}
@@ -1057,25 +1078,46 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
+static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
+{
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ unsigned long roots_to_free = 0;
+ int i;
+
+ /*
+ * If neither the current CR3 nor any of the prev_roots use the given
+ * PCID, then nothing needs to be done here because a resync will
+ * happen anyway before switching to any other CR3.
+ */
+ if (kvm_get_active_pcid(vcpu) == pcid) {
+ kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+}
+
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
bool skip_tlb_flush = false;
+ unsigned long pcid = 0;
#ifdef CONFIG_X86_64
bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
if (pcid_enabled) {
skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
cr3 &= ~X86_CR3_PCID_NOFLUSH;
+ pcid = cr3 & X86_CR3_PCID_MASK;
}
#endif
- if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
- if (!skip_tlb_flush) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
- return 0;
- }
+ /* PDPTRs are always reloaded for PAE paging. */
+ if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
+ goto handle_tlb_flush;
/*
* Do not condition the GPA check on long mode, this helper is used to
@@ -1088,10 +1130,23 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
- kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
+ if (cr3 != kvm_read_cr3(vcpu))
+ kvm_mmu_new_pgd(vcpu, cr3);
+
vcpu->arch.cr3 = cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
+handle_tlb_flush:
+ /*
+ * A load of CR3 that flushes the TLB flushes only the current PCID,
+ * even if PCID is disabled, in which case PCID=0 is flushed. It's a
+ * moot point in the end because _disabling_ PCID will flush all PCIDs,
+ * and it's impossible to use a non-zero PCID when PCID is disabled,
+ * i.e. only PCID=0 can be relevant.
+ */
+ if (!skip_tlb_flush)
+ kvm_invalidate_pcid(vcpu, pcid);
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -1149,6 +1204,9 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
fixed |= DR6_RTM;
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
+ fixed |= DR6_BUS_LOCK;
return fixed;
}
@@ -1615,6 +1673,30 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
* invokes 64-bit SYSENTER.
*/
data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+
+ /*
+ * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
+ * incomplete and conflicting architectural behavior. Current
+ * AMD CPUs completely ignore bits 63:32, i.e. they aren't
+ * reserved and always read as zeros. Enforce Intel's reserved
+ * bits check if and only if the guest CPU is Intel, and clear
+ * the bits in all other cases. This ensures cross-vendor
+ * migration will provide consistent behavior for the guest.
+ */
+ if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
+ return 1;
+
+ data = (u32)data;
+ break;
}
msr.data = data;
@@ -1651,6 +1733,18 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED;
+ switch (index) {
+ case MSR_TSC_AUX:
+ if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
+ return 1;
+
+ if (!host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
+ return 1;
+ break;
+ }
+
msr.index = index;
msr.host_initiated = host_initiated;
@@ -2113,13 +2207,15 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm)
return v;
}
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
+
static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
u64 ratio;
/* Guest TSC same frequency as host TSC? */
if (!scale) {
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return 0;
}
@@ -2145,7 +2241,7 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
return -1;
}
- vcpu->arch.tsc_scaling_ratio = ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
return 0;
}
@@ -2157,7 +2253,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
/* tsc_khz can be zero if TSC calibration fails */
if (user_tsc_khz == 0) {
/* set tsc_scaling_ratio to a safe value */
- vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
+ kvm_vcpu_write_tsc_multiplier(vcpu, kvm_default_tsc_scaling_ratio);
return -1;
}
@@ -2239,10 +2335,9 @@ static inline u64 __scale_tsc(u64 ratio, u64 tsc)
return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
}
-u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
+u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, u64 ratio)
{
u64 _tsc = tsc;
- u64 ratio = vcpu->arch.tsc_scaling_ratio;
if (ratio != kvm_default_tsc_scaling_ratio)
_tsc = __scale_tsc(ratio, tsc);
@@ -2251,25 +2346,86 @@ u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
}
EXPORT_SYMBOL_GPL(kvm_scale_tsc);
-static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
+static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
- tsc = kvm_scale_tsc(vcpu, rdtsc());
+ tsc = kvm_scale_tsc(vcpu, rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
return target_tsc - tsc;
}
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ return vcpu->arch.l1_tsc_offset +
+ kvm_scale_tsc(vcpu, host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
-static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
+{
+ u64 nested_offset;
+
+ if (l2_multiplier == kvm_default_tsc_scaling_ratio)
+ nested_offset = l1_offset;
+ else
+ nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ nested_offset += l2_offset;
+ return nested_offset;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
+
+u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
{
- vcpu->arch.l1_tsc_offset = offset;
- vcpu->arch.tsc_offset = static_call(kvm_x86_write_l1_tsc_offset)(vcpu, offset);
+ if (l2_multiplier != kvm_default_tsc_scaling_ratio)
+ return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
+ kvm_tsc_scaling_ratio_frac_bits);
+
+ return l1_multiplier;
+}
+EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
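For reference, the two helpers above simply compose the fixed-point TSC transforms that L1 and L0 apply on behalf of L2; a sketch of the algebra, writing "frac" for kvm_tsc_scaling_ratio_frac_bits:

        L1_tsc = (host_tsc * l1_multiplier >> frac) + l1_offset
        L2_tsc = (L1_tsc * l2_multiplier >> frac) + l2_offset
               = host_tsc * ((l1_multiplier * l2_multiplier) >> frac) >> frac
                 + ((l1_offset * l2_multiplier) >> frac) + l2_offset

so kvm_calc_nested_tsc_multiplier() produces the combined multiplier and kvm_calc_nested_tsc_offset() the combined offset that kvm_vcpu_write_tsc_multiplier() and kvm_vcpu_write_tsc_offset() below program while L2 is active.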
+
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
+{
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ vcpu->arch.l1_tsc_offset,
+ l1_offset);
+
+ vcpu->arch.l1_tsc_offset = l1_offset;
+
+ /*
+ * If we are here because L1 chose not to trap WRMSR to TSC then
+ * according to the spec this should set L1's TSC (as opposed to
+ * setting L1's offset for L2).
+ */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
+ l1_offset,
+ static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_offset = l1_offset;
+
+ static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
+}
+
+static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
+{
+ vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
+
+ /* Userspace is changing the multiplier while L2 is active */
+ if (is_guest_mode(vcpu))
+ vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
+ l1_multiplier,
+ static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
+ else
+ vcpu->arch.tsc_scaling_ratio = l1_multiplier;
+
+ if (kvm_has_tsc_control)
+ static_call(kvm_x86_write_tsc_multiplier)(
+ vcpu, vcpu->arch.tsc_scaling_ratio);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -2295,7 +2451,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
@@ -2334,7 +2490,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
- offset = kvm_compute_tsc_offset(vcpu, data);
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
}
matched = true;
already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@ -2393,9 +2549,10 @@ static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
{
- if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
+ if (vcpu->arch.l1_tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
WARN_ON(adjustment < 0);
- adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
+ adjustment = kvm_scale_tsc(vcpu, (u64) adjustment,
+ vcpu->arch.l1_tsc_scaling_ratio);
adjust_tsc_offset_guest(vcpu, adjustment);
}
@@ -2778,7 +2935,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* With all the info we got, fill in the values */
if (kvm_has_tsc_control)
- tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+ tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz,
+ v->arch.l1_tsc_scaling_ratio);
if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
@@ -3006,6 +3164,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
+
+ if (!tdp_enabled) {
+ /*
+ * A TLB flush on behalf of the guest is equivalent to
+ * INVPCID(all), toggling CR4.PGE, etc., which requires
+ * a forced sync of the shadow page tables. Unload the
+ * entire MMU here and the subsequent load will sync the
+ * shadow page tables, and also flush the TLB.
+ */
+ kvm_mmu_unload(vcpu);
+ return;
+ }
+
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
@@ -3035,10 +3206,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = xchg(&st->preempted, 0);
+
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- st->preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+ } else {
+ st->preempted = 0;
}
vcpu->arch.st.preempted = 0;
@@ -3167,7 +3342,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (msr_info->host_initiated) {
kvm_synchronize_tsc(vcpu, data);
} else {
- u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
vcpu->arch.ia32_tsc_adjust_msr += adj;
}
@@ -3402,7 +3577,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_LASTBRANCHTOIP:
case MSR_IA32_LASTINTFROMIP:
case MSR_IA32_LASTINTTOIP:
- case MSR_K8_SYSCFG:
+ case MSR_AMD64_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
case MSR_VM_HSAVE_PA:
@@ -3469,10 +3644,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* return L1's TSC value to ensure backwards-compatible
* behavior for migration.
*/
- u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
- vcpu->arch.tsc_offset;
+ u64 offset, ratio;
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+ if (msr_info->host_initiated) {
+ offset = vcpu->arch.l1_tsc_offset;
+ ratio = vcpu->arch.l1_tsc_scaling_ratio;
+ } else {
+ offset = vcpu->arch.tsc_offset;
+ ratio = vcpu->arch.tsc_scaling_ratio;
+ }
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc(), ratio) + offset;
break;
}
case MSR_MTRRcap:
@@ -3796,6 +3978,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TLBFLUSH:
case KVM_CAP_HYPERV_SEND_IPI:
case KVM_CAP_HYPERV_CPUID:
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
case KVM_CAP_SYS_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
@@ -3826,8 +4009,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
+ case KVM_CAP_SREGS2:
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
r = 1;
break;
+ case KVM_CAP_EXIT_HYPERCALL:
+ r = KVM_EXIT_HYPERCALL_VALID_MASK;
+ break;
case KVM_CAP_SET_GUEST_DEBUG2:
return KVM_GUESTDBG_VALID_MASK;
#ifdef CONFIG_KVM_XEN
@@ -4055,7 +4243,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
mark_tsc_unstable("KVM discovered backwards TSC");
if (kvm_check_tsc_unstable()) {
- u64 offset = kvm_compute_tsc_offset(vcpu,
+ u64 offset = kvm_compute_l1_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
kvm_vcpu_write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
@@ -4374,7 +4562,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_smm_changed(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
@@ -4434,13 +4622,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
- if (events->smi.smm)
- vcpu->arch.hflags |= HF_SMM_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- kvm_smm_changed(vcpu);
- }
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+ kvm_smm_changed(vcpu, events->smi.smm);
vcpu->arch.smi_pending = events->smi.pending;
@@ -4724,6 +4907,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
return static_call(kvm_x86_enable_direct_tlbflush)(vcpu);
+ case KVM_CAP_HYPERV_ENFORCE_CPUID:
+ return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
+
case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
vcpu->arch.pv_cpuid.enforce = cap->args[0];
if (vcpu->arch.pv_cpuid.enforce)
@@ -4742,6 +4928,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
void __user *argp = (void __user *)arg;
int r;
union {
+ struct kvm_sregs2 *sregs2;
struct kvm_lapic_state *lapic;
struct kvm_xsave *xsave;
struct kvm_xcrs *xcrs;
@@ -5114,6 +5301,28 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
#endif
+ case KVM_GET_SREGS2: {
+ u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
+ r = -ENOMEM;
+ if (!u.sregs2)
+ goto out;
+ __get_sregs2(vcpu, u.sregs2);
+ r = -EFAULT;
+ if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_SREGS2: {
+ u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
+ if (IS_ERR(u.sregs2)) {
+ r = PTR_ERR(u.sregs2);
+ u.sregs2 = NULL;
+ goto out;
+ }
+ r = __set_sregs2(vcpu, u.sregs2);
+ break;
+ }
default:
r = -EINVAL;
}
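For readers of the new ioctls, a minimal userspace sketch (not part of this patch) of the KVM_GET_SREGS2/KVM_SET_SREGS2 round-trip; vcpu_fd is assumed to come from KVM_CREATE_VCPU and the ioctl definitions from the matching uapi update in this series:

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <err.h>

static void migrate_sregs2(int src_vcpu_fd, int dst_vcpu_fd)
{
        struct kvm_sregs2 sregs2;

        if (ioctl(src_vcpu_fd, KVM_GET_SREGS2, &sregs2))
                err(1, "KVM_GET_SREGS2");

        /*
         * For PAE guests, __get_sregs2() above populates sregs2.pdptrs[]
         * and sets KVM_SREGS2_FLAGS_PDPTRS_VALID, so the cached PDPTEs
         * survive migration instead of being re-read from guest memory.
         */
        if (ioctl(dst_vcpu_fd, KVM_SET_SREGS2, &sregs2))
                err(1, "KVM_SET_SREGS2");
}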
@@ -5433,6 +5642,21 @@ split_irqchip_unlock:
if (kvm_x86_ops.vm_copy_enc_context_from)
r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]);
return r;
+ case KVM_CAP_EXIT_HYPERCALL:
+ if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
+ r = -EINVAL;
+ break;
+ }
+ kvm->arch.hypercall_exit_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
+ r = -EINVAL;
+ if (cap->args[0] & ~1)
+ break;
+ kvm->arch.exit_on_emulation_error = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
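Both new VM capabilities are opt-in from userspace via KVM_ENABLE_CAP on the VM fd. A minimal sketch (not part of this patch) of a VMM enabling them, assuming vm_fd came from KVM_CREATE_VM:

#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <err.h>

static void enable_new_vm_caps(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_EXIT_HYPERCALL,
                .args[0] = 1ULL << KVM_HC_MAP_GPA_RANGE,
        };

        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                err(1, "KVM_ENABLE_CAP(KVM_CAP_EXIT_HYPERCALL)");

        cap = (struct kvm_enable_cap) {
                .cap = KVM_CAP_EXIT_ON_EMULATION_FAILURE,
                .args[0] = 1,
        };
        if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap))
                err(1, "KVM_ENABLE_CAP(KVM_CAP_EXIT_ON_EMULATION_FAILURE)");
}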
@@ -5468,14 +5692,18 @@ static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
struct kvm_msr_filter_range *user_range)
{
- struct msr_bitmap_range range;
unsigned long *bitmap = NULL;
size_t bitmap_size;
- int r;
if (!user_range->nmsrs)
return 0;
+ if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE))
+ return -EINVAL;
+
+ if (!user_range->flags)
+ return -EINVAL;
+
bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
return -EINVAL;
@@ -5484,31 +5712,15 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
if (IS_ERR(bitmap))
return PTR_ERR(bitmap);
- range = (struct msr_bitmap_range) {
+ msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
.flags = user_range->flags,
.base = user_range->base,
.nmsrs = user_range->nmsrs,
.bitmap = bitmap,
};
- if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
- r = -EINVAL;
- goto err;
- }
-
- if (!range.flags) {
- r = -EINVAL;
- goto err;
- }
-
- /* Everything ok, add this range identifier. */
- msr_filter->ranges[msr_filter->count] = range;
msr_filter->count++;
-
return 0;
-err:
- kfree(bitmap);
- return r;
}
static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
@@ -5559,6 +5771,41 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
return 0;
}
+#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
+static int kvm_arch_suspend_notifier(struct kvm *kvm)
+{
+ struct kvm_vcpu *vcpu;
+ int i, ret = 0;
+
+ mutex_lock(&kvm->lock);
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!vcpu->arch.pv_time_enabled)
+ continue;
+
+ ret = kvm_set_guest_paused(vcpu);
+ if (ret) {
+ kvm_err("Failed to pause guest VCPU%d: %d\n",
+ vcpu->vcpu_id, ret);
+ break;
+ }
+ }
+ mutex_unlock(&kvm->lock);
+
+ return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return kvm_arch_suspend_notifier(kvm);
+ }
+
+ return NOTIFY_DONE;
+}
+#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5937,7 +6184,8 @@ static void kvm_init_msr_list(void)
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
continue;
break;
case MSR_IA32_UMWAIT_CONTROL:
@@ -7032,20 +7280,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
return emul_to_vcpu(ctxt)->arch.hflags;
}
-static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
+static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt)
{
- emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ kvm_smm_changed(vcpu, false);
}
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return static_call(kvm_x86_pre_leave_smm)(emul_to_vcpu(ctxt), smstate);
+ return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate);
}
-static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
{
- kvm_smm_changed(emul_to_vcpu(ctxt));
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
}
static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
@@ -7094,9 +7344,9 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_fxsr = emulator_guest_has_fxsr,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
- .set_hflags = emulator_set_hflags,
- .pre_leave_smm = emulator_pre_leave_smm,
- .post_leave_smm = emulator_post_leave_smm,
+ .exiting_smm = emulator_exiting_smm,
+ .leave_smm = emulator_leave_smm,
+ .triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
};
@@ -7171,6 +7421,11 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
@@ -7197,8 +7452,33 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
+static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ u32 insn_size = ctxt->fetch.end - ctxt->fetch.data;
+ struct kvm_run *run = vcpu->run;
+
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ run->emulation_failure.ndata = 0;
+ run->emulation_failure.flags = 0;
+
+ if (insn_size) {
+ run->emulation_failure.ndata = 3;
+ run->emulation_failure.flags |=
+ KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
+ run->emulation_failure.insn_size = insn_size;
+ memset(run->emulation_failure.insn_bytes, 0x90,
+ sizeof(run->emulation_failure.insn_bytes));
+ memcpy(run->emulation_failure.insn_bytes,
+ ctxt->fetch.data, insn_size);
+ }
+}
+
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
+ struct kvm *kvm = vcpu->kvm;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
@@ -7207,10 +7487,9 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
- if (emulation_type & EMULTYPE_SKIP) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (kvm->arch.exit_on_emulation_error ||
+ (emulation_type & EMULTYPE_SKIP)) {
+ prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -7352,11 +7631,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
-static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
- if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
- /* This is a good place to trace that we are exiting SMM. */
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+ trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
+
+ if (entering_smm) {
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ } else {
+ vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK);
/* Process a latched INIT or SMI, if any. */
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -7506,14 +7788,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
kvm_vcpu_check_breakpoint(vcpu, &r))
return r;
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
+ r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
@@ -8040,6 +8315,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -8050,13 +8337,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -8118,6 +8406,7 @@ int kvm_arch_init(void *opaque)
printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
goto out_free_x86_emulator_cache;
}
+ kvm_nr_uret_msrs = 0;
r = kvm_mmu_module_init();
if (r)
@@ -8168,10 +8457,13 @@ void kvm_arch_exit(void)
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
free_percpu(user_return_msrs);
+ kmem_cache_destroy(x86_emulator_cache);
kmem_cache_destroy(x86_fpu_cache);
#ifdef CONFIG_KVM_XEN
static_key_deferred_flush(&kvm_xen_enabled);
@@ -8271,16 +8563,15 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
-void kvm_apicv_init(struct kvm *kvm, bool enable)
+static void kvm_apicv_init(struct kvm *kvm)
{
- if (enable)
+ if (enable_apicv)
clear_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
else
set_bit(APICV_INHIBIT_REASON_DISABLE,
&kvm->arch.apicv_inhibit_reasons);
}
-EXPORT_SYMBOL_GPL(kvm_apicv_init);
static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
{
@@ -8289,6 +8580,9 @@ static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
vcpu->stat.directed_yield_attempted++;
+ if (single_task_running())
+ goto no_yield;
+
rcu_read_lock();
map = rcu_dereference(vcpu->kvm->arch.apic_map);
@@ -8313,6 +8607,17 @@ no_yield:
return;
}
+static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
+{
+ u64 ret = vcpu->run->hypercall.ret;
+
+ if (!is_64_bit_mode(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ ++vcpu->stat.hypercalls;
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
@@ -8378,6 +8683,28 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
kvm_sched_yield(vcpu, a0);
ret = 0;
break;
+ case KVM_HC_MAP_GPA_RANGE: {
+ u64 gpa = a0, npages = a1, attrs = a2;
+
+ ret = -KVM_ENOSYS;
+ if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
+ break;
+
+ if (!PAGE_ALIGNED(gpa) || !npages ||
+ gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
+ ret = -KVM_EINVAL;
+ break;
+ }
+
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gpa;
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = attrs;
+ vcpu->run->hypercall.longmode = op_64_bit;
+ vcpu->arch.complete_userspace_io = complete_hypercall_exit;
+ return 0;
+ }
default:
ret = -KVM_ENOSYS;
break;
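The matching userspace side of KVM_HC_MAP_GPA_RANGE is to fill run->hypercall.ret before re-entering the vCPU, at which point complete_hypercall_exit() above finishes the hypercall. A minimal sketch; handle_map_gpa_range() is a hypothetical VMM helper, not a KVM API:

#include <linux/kvm.h>
#include <linux/kvm_para.h>

/* Hypothetical VMM policy hook, e.g. shared<->private conversion. */
extern __u64 handle_map_gpa_range(__u64 gpa, __u64 npages, __u64 attrs);

static void handle_hypercall_exit(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_HYPERCALL ||
            run->hypercall.nr != KVM_HC_MAP_GPA_RANGE)
                return;

        /* args[] layout mirrors what kvm_emulate_hypercall() fills in above. */
        run->hypercall.ret = handle_map_gpa_range(run->hypercall.args[0],
                                                  run->hypercall.args[1],
                                                  run->hypercall.args[2]);
        /* The next KVM_RUN resumes the guest via complete_hypercall_exit(). */
}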
@@ -8461,9 +8788,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
int kvm_check_nested_events(struct kvm_vcpu *vcpu)
{
- if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
- return -EIO;
-
if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
kvm_x86_ops.nested_ops->triple_fault(vcpu);
return 1;
@@ -8479,7 +8803,7 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
static_call(kvm_x86_queue_exception)(vcpu);
}
-static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
bool can_inject = true;
@@ -8526,7 +8850,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (is_guest_mode(vcpu)) {
r = kvm_check_nested_events(vcpu);
if (r < 0)
- goto busy;
+ goto out;
}
/* try to inject new event if pending */
@@ -8568,7 +8892,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.smi_pending) {
r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
vcpu->arch.smi_pending = false;
++vcpu->arch.smi_count;
@@ -8581,7 +8905,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (vcpu->arch.nmi_pending) {
r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
@@ -8596,7 +8920,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
if (kvm_cpu_has_injectable_intr(vcpu)) {
r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
if (r < 0)
- goto busy;
+ goto out;
if (r) {
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
static_call(kvm_x86_set_irq)(vcpu);
@@ -8612,11 +8936,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
*req_immediate_exit = true;
WARN_ON(vcpu->arch.exception.pending);
- return;
+ return 0;
-busy:
- *req_immediate_exit = true;
- return;
+out:
+ if (r == -EBUSY) {
+ *req_immediate_exit = true;
+ r = 0;
+ }
+ return r;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -8795,10 +9122,9 @@ static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
+ unsigned long cr0;
char buf[512];
- u32 cr0;
- trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
@@ -8808,13 +9134,13 @@ static void enter_smm(struct kvm_vcpu *vcpu)
enter_smm_save_state_32(vcpu, buf);
/*
- * Give pre_enter_smm() a chance to make ISA-specific changes to the
- * vCPU state (e.g. leave guest mode) after we've saved the state into
- * the SMM state-save area.
+ * Give enter_smm() a chance to make ISA-specific changes to the vCPU
+ * state (e.g. leave guest mode) after we've saved the state into the
+ * SMM state-save area.
*/
- static_call(kvm_x86_pre_enter_smm)(vcpu, buf);
+ static_call(kvm_x86_enter_smm)(vcpu, buf);
- vcpu->arch.hflags |= HF_SMM_MASK;
+ kvm_smm_changed(vcpu, true);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (static_call(kvm_x86_get_nmi_mask)(vcpu))
@@ -8903,6 +9229,15 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
kvm_apic_update_apicv(vcpu);
static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
+
+ /*
+ * When APICv gets disabled, we may still have injected interrupts
+ * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
+ * still active when the interrupt got accepted. Make sure
+ * inject_pending_event() is called to check for that.
+ */
+ if (!vcpu->arch.apicv_active)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
@@ -9078,7 +9413,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
- if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -9171,13 +9506,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
kvm_xen_has_interrupt(vcpu)) {
++vcpu->stat.req_event;
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
goto out;
}
- inject_pending_event(vcpu, &req_immediate_exit);
+ r = inject_pending_event(vcpu, &req_immediate_exit);
+ if (r < 0) {
+ r = 0;
+ goto out;
+ }
if (req_int_win)
static_call(kvm_x86_enable_irq_window)(vcpu);
@@ -9379,7 +9722,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
return 1;
}
- kvm_apic_accept_events(vcpu);
+ if (kvm_apic_accept_events(vcpu) < 0)
+ return 0;
switch(vcpu->arch.mp_state) {
case KVM_MP_STATE_HALTED:
case KVM_MP_STATE_AP_RESET_HOLD:
@@ -9425,7 +9769,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
if (r <= 0)
break;
- kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -9603,7 +9947,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
goto out;
}
kvm_vcpu_block(vcpu);
- kvm_apic_accept_events(vcpu);
+ if (kvm_apic_accept_events(vcpu) < 0) {
+ r = 0;
+ goto out;
+ }
kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
@@ -9752,7 +10099,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
}
EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
-static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
struct desc_ptr dt;
@@ -9785,14 +10132,36 @@ skip_protected_regs:
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
+}
- memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
+static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ __get_sregs_common(vcpu, sregs);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
(unsigned long *)sregs->interrupt_bitmap);
}
+static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int i;
+
+ __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
+
+ if (vcpu->arch.guest_state_protected)
+ return;
+
+ if (is_pae_paging(vcpu)) {
+ for (i = 0 ; i < 4 ; i++)
+ sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
+ sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ }
+}
+
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -9805,11 +10174,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
+ int r;
+
vcpu_load(vcpu);
if (kvm_mpx_supported())
kvm_load_guest_fpu(vcpu);
- kvm_apic_accept_events(vcpu);
+ r = kvm_apic_accept_events(vcpu);
+ if (r < 0)
+ goto out;
+ r = 0;
+
if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
vcpu->arch.pv.pv_unhalted)
@@ -9817,10 +10192,11 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+out:
if (kvm_mpx_supported())
kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
- return 0;
+ return r;
}
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
@@ -9904,24 +10280,23 @@ static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return kvm_is_valid_cr4(vcpu, sregs->cr4);
}
-static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
+ int *mmu_reset_needed, bool update_pdptrs)
{
struct msr_data apic_base_msr;
- int mmu_reset_needed = 0;
- int pending_vec, max_bits, idx;
+ int idx;
struct desc_ptr dt;
- int ret = -EINVAL;
if (!kvm_is_valid_sregs(vcpu, sregs))
- goto out;
+ return -EINVAL;
apic_base_msr.data = sregs->apic_base;
apic_base_msr.host_initiated = true;
if (kvm_set_apic_base(vcpu, &apic_base_msr))
- goto out;
+ return -EINVAL;
if (vcpu->arch.guest_state_protected)
- goto skip_protected_regs;
+ return 0;
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
@@ -9931,31 +10306,30 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
static_call(kvm_x86_set_gdt)(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
+ *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
- mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
+ *mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
- mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
+ *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
- mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
+ *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (is_pae_paging(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
- mmu_reset_needed = 1;
+ if (update_pdptrs) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ if (is_pae_paging(vcpu)) {
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
+ *mmu_reset_needed = 1;
+ }
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
-
- if (mmu_reset_needed)
- kvm_mmu_reset_context(vcpu);
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
@@ -9975,20 +10349,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
!is_protmode(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-skip_protected_regs:
+ return 0;
+}
+
+static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+ int pending_vec, max_bits;
+ int mmu_reset_needed = 0;
+ int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
+
+ if (ret)
+ return ret;
+
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+
max_bits = KVM_NR_INTERRUPTS;
pending_vec = find_first_bit(
(const unsigned long *)sregs->interrupt_bitmap, max_bits);
+
if (pending_vec < max_bits) {
kvm_queue_interrupt(vcpu, pending_vec, false);
pr_debug("Set back pending irq %d\n", pending_vec);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+ return 0;
+}
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
+{
+ int mmu_reset_needed = 0;
+ bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
+ bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
+ !(sregs2->efer & EFER_LMA);
+ int i, ret;
- ret = 0;
-out:
- return ret;
+ if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
+ return -EINVAL;
+
+ if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
+ return -EINVAL;
+
+ ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
+ &mmu_reset_needed, !valid_pdptrs);
+ if (ret)
+ return ret;
+
+ if (valid_pdptrs) {
+ for (i = 0; i < 4 ; i++)
+ kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+ mmu_reset_needed = 1;
+ vcpu->arch.pdptrs_from_userspace = true;
+ }
+ if (mmu_reset_needed)
+ kvm_mmu_reset_context(vcpu);
+ return 0;
}
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
@@ -10044,8 +10461,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
kvm_update_dr7(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
- vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
/*
* Trigger an rflags update that will inject or remove the trace
@@ -10213,13 +10629,13 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
struct page *page;
int r;
+ vcpu->arch.last_vmentry_cpu = -1;
+
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
r = kvm_mmu_create(vcpu);
if (r < 0)
return r;
@@ -10279,6 +10695,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
+#if IS_ENABLED(CONFIG_HYPERV)
+ vcpu->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
@@ -10287,8 +10707,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
kvm_vcpu_reset(vcpu, false);
- kvm_init_mmu(vcpu, false);
+ kvm_init_mmu(vcpu);
vcpu_put(vcpu);
return 0;
@@ -10362,6 +10783,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ unsigned long old_cr0 = kvm_read_cr0(vcpu);
+
kvm_lapic_reset(vcpu, init_event);
vcpu->arch.hflags = 0;
@@ -10430,6 +10853,17 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.ia32_xss = 0;
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
+
+ /*
+ * Reset the MMU context if paging was enabled prior to INIT (which is
+ * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the
+ * standard CR0/CR4/EFER modification paths, only CR0.PG needs to be
+ * checked because it is unconditionally cleared on INIT and all other
+ * paging related bits are ignored if paging is disabled, i.e. CR0.WP,
+ * CR4, and EFER changes are all irrelevant if CR0.PG was '0'.
+ */
+ if (old_cr0 & X86_CR0_PG)
+ kvm_mmu_reset_context(vcpu);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -10547,6 +10981,9 @@ int kvm_arch_hardware_setup(void *opaque)
int r;
rdmsrl_safe(MSR_EFER, &host_efer);
+ if (WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_NX) &&
+ !(host_efer & EFER_NX)))
+ return -EIO;
if (boot_cpu_has(X86_FEATURE_XSAVES))
rdmsrl(MSR_IA32_XSS, host_xss);
@@ -10662,9 +11099,15 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.guest_can_read_msr_platform_info = true;
+#if IS_ENABLED(CONFIG_HYPERV)
+ spin_lock_init(&kvm->arch.hv_root_tdp_lock);
+ kvm->arch.hv_root_tdp = INVALID_PAGE;
+#endif
+
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
+ kvm_apicv_init(kvm);
kvm_hv_init_vm(kvm);
kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
@@ -10825,17 +11268,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
}
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void memslot_rmap_free(struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
+ }
+}
- if (i == 0)
- continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ int i;
+
+ memslot_rmap_free(slot);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10843,11 +11292,79 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}
-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
- unsigned long npages)
+static int memslot_rmap_alloc(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
+ const int sz = sizeof(*slot->arch.rmap[0]);
int i;
+ for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ int level = i + 1;
+ int lpages = gfn_to_index(slot->base_gfn + npages - 1,
+ slot->base_gfn, level) + 1;
+
+ WARN_ON(slot->arch.rmap[i]);
+
+ slot->arch.rmap[i] = kvcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.rmap[i]) {
+ memslot_rmap_free(slot);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
+ int r, i;
+
+ /*
+ * Check if memslots already have rmaps early before acquiring
+ * the slots_arch_lock below.
+ */
+ if (kvm_memslots_have_rmaps(kvm))
+ return 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * Read memslots_have_rmaps again, under the slots arch lock,
+ * before allocating the rmaps
+ */
+ if (kvm_memslots_have_rmaps(kvm)) {
+ mutex_unlock(&kvm->slots_arch_lock);
+ return 0;
+ }
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(slot, slots) {
+ r = memslot_rmap_alloc(slot, slot->npages);
+ if (r) {
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+ }
+ }
+ }
+
+ /*
+ * Ensure that memslots_have_rmaps becomes true strictly after
+ * all the rmap pointers are set.
+ */
+ smp_store_release(&kvm->arch.memslots_have_rmaps, true);
+ mutex_unlock(&kvm->slots_arch_lock);
+ return 0;
+}
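The smp_store_release() above is intended to pair with an acquire load on readers that check for rmaps without taking slots_arch_lock; a sketch of what such a reader looks like (the real helper lives in KVM's MMU headers and may differ in detail):

static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
        /* Pairs with smp_store_release() in alloc_all_memslots_rmaps(). */
        return smp_load_acquire(&kvm->arch.memslots_have_rmaps);
}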
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i, r;
+
/*
* Clear out the previous array pointers for the KVM_MR_MOVE case. The
* old arrays will be freed by __kvm_set_memory_region() if installing
@@ -10855,7 +11372,13 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
*/
memset(&slot->arch, 0, sizeof(slot->arch));
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ if (kvm_memslots_have_rmaps(kvm)) {
+ r = memslot_rmap_alloc(slot, npages);
+ if (r)
+ return r;
+ }
+
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
int lpages;
@@ -10864,14 +11387,6 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
- slot->arch.rmap[i] =
- kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL_ACCOUNT);
- if (!slot->arch.rmap[i])
- goto out_free;
- if (i == 0)
- continue;
-
linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -10901,12 +11416,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
return 0;
out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.rmap[i]);
- slot->arch.rmap[i] = NULL;
- if (i == 0)
- continue;
+ memslot_rmap_free(slot);
+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10935,7 +11447,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(memslot,
+ return kvm_alloc_memslot_metadata(kvm, memslot,
mem->memory_size >> PAGE_SHIFT);
return 0;
}
@@ -11011,36 +11523,19 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*/
kvm_mmu_zap_collapsible_sptes(kvm, new);
} else {
- /* By default, write-protect everything to log writes. */
- int level = PG_LEVEL_4K;
+ /*
+ * Initially-all-set does not require write protecting any page,
+ * because they're all assumed to be dirty.
+ */
+ if (kvm_dirty_log_manual_protect_and_init_set(kvm))
+ return;
if (kvm_x86_ops.cpu_dirty_log_size) {
- /*
- * Clear all dirty bits, unless pages are treated as
- * dirty from the get-go.
- */
- if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
- kvm_mmu_slot_leaf_clear_dirty(kvm, new);
-
- /*
- * Write-protect large pages on write so that dirty
- * logging happens at 4k granularity. No need to
- * write-protect small SPTEs since write accesses are
- * logged by the CPU via dirty bits.
- */
- level = PG_LEVEL_2M;
- } else if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
- /*
- * If we're with initial-all-set, we don't need
- * to write protect any small page because
- * they're reported as dirty already. However
- * we still need to write-protect huge pages
- * so that the page split can happen lazily on
- * the first write to the huge page.
- */
- level = PG_LEVEL_2M;
+ kvm_mmu_slot_leaf_clear_dirty(kvm, new);
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
+ } else {
+ kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
}
- kvm_mmu_slot_remove_write_access(kvm, new, level);
}
}
@@ -11428,7 +11923,8 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
void kvm_arch_start_assignment(struct kvm *kvm)
{
- atomic_inc(&kvm->arch.assigned_device_count);
+ if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
+ static_call_cond(kvm_x86_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
@@ -11608,8 +12104,6 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
bool pcid_enabled;
struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
@@ -11643,23 +12137,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
return 1;
}
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
+ kvm_invalidate_pcid(vcpu, operand.pcid);
return kvm_skip_emulated_instruction(vcpu);
case INVPCID_TYPE_ALL_NON_GLOBAL:
@@ -11672,7 +12150,7 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
fallthrough;
case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return kvm_skip_emulated_instruction(vcpu);
default: