diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 7864 |
1 files changed, 5536 insertions, 2328 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3156e25b0774..ecea83f0da49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -18,15 +18,18 @@ #include <linux/kvm_host.h> #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" #include "hyperv.h" #include "lapic.h" +#include "xen.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -38,7 +41,6 @@ #include <linux/mman.h> #include <linux/highmem.h> #include <linux/iommu.h> -#include <linux/intel-iommu.h> #include <linux/cpufreq.h> #include <linux/user-return-notifier.h> #include <linux/srcu.h> @@ -54,6 +56,8 @@ #include <linux/sched/stat.h> #include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> +#include <linux/entry-kvm.h> +#include <linux/suspend.h> #include <trace/events/kvm.h> @@ -61,15 +65,20 @@ #include <asm/msr.h> #include <asm/desc.h> #include <asm/mce.h> +#include <asm/pkru.h> #include <linux/kernel_stat.h> -#include <asm/fpu/internal.h> /* Ugh! */ +#include <asm/fpu/api.h> +#include <asm/fpu/xcr.h> +#include <asm/fpu/xstate.h> #include <asm/pvclock.h> #include <asm/div64.h> #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> #include <asm/intel_pt.h> #include <asm/emulate_prefix.h> +#include <asm/sgx.h> #include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS @@ -77,11 +86,16 @@ #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 -u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; -EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); + +struct kvm_caps kvm_caps __read_mostly = { + .supported_mce_cap = MCG_CTL_P | MCG_SER_P, +}; +EXPORT_SYMBOL_GPL(kvm_caps); + +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -96,27 +110,42 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ +#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE) + +#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void process_smi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu); + +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); + +struct kvm_x86_ops kvm_x86_ops __read_mostly; -struct kvm_x86_ops *kvm_x86_ops __read_mostly; -EXPORT_SYMBOL_GPL(kvm_x86_ops); +#define KVM_X86_OP(func) \ + DEFINE_STATIC_CALL_NULL(kvm_x86_##func, \ + *(((struct kvm_x86_ops *)0)->func)); +#define KVM_X86_OP_OPTIONAL KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP +#include <asm/kvm-x86-ops.h> +EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits); +EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg); static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); -static bool __read_mostly report_ignored_msrs = true; +bool __read_mostly report_ignored_msrs = true; module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +EXPORT_SYMBOL_GPL(report_ignored_msrs); unsigned int min_timer_period_us = 200; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -124,26 +153,15 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool __read_mostly kvm_has_tsc_control; -EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 __read_mostly kvm_max_guest_tsc_khz; -EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); -u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; -EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); -u64 __read_mostly kvm_max_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); -u64 __read_mostly kvm_default_tsc_scaling_ratio; -EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); - /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables - * adaptive tuning starting from default advancment of 1000ns. '0' disables + * adaptive tuning starting from default advancement of 1000ns. '0' disables * advancement entirely. Any other value is used as-is and disables adaptive - * tuning, i.e. allows priveleged userspace to set an exact advancement time. + * tuning, i.e. allows privileged userspace to set an exact advancement time. */ static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); @@ -155,96 +173,187 @@ bool __read_mostly enable_vmware_backdoor = false; module_param(enable_vmware_backdoor, bool, S_IRUGO); EXPORT_SYMBOL_GPL(enable_vmware_backdoor); -static bool __read_mostly force_emulation_prefix = false; -module_param(force_emulation_prefix, bool, S_IRUGO); +/* + * Flags to manipulate forced emulation behavior (any non-zero value will + * enable forced emulation). + */ +#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1) +static int __read_mostly force_emulation_prefix; +module_param(force_emulation_prefix, int, 0644); int __read_mostly pi_inject_timer = -1; module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -#define KVM_NR_SHARED_MSRS 16 +/* Enable/disable PMU virtualization */ +bool __read_mostly enable_pmu = true; +EXPORT_SYMBOL_GPL(enable_pmu); +module_param(enable_pmu, bool, 0444); -struct kvm_shared_msrs_global { - int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; -}; +bool __read_mostly eager_page_split = true; +module_param(eager_page_split, bool, 0644); + +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. + */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; +}; + +u32 __read_mostly kvm_nr_uret_msrs; +EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs); +static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS]; +static struct kvm_user_return_msrs __percpu *user_return_msrs; + +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) + +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + +bool __read_mostly allow_smaller_maxphyaddr = 0; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + +bool __read_mostly enable_apicv = true; +EXPORT_SYMBOL_GPL(enable_apicv); + +u64 __read_mostly host_xss; +EXPORT_SYMBOL_GPL(host_xss); + +const struct _kvm_stats_desc kvm_vm_stats_desc[] = { + KVM_GENERIC_VM_STATS(), + STATS_DESC_COUNTER(VM, mmu_shadow_zapped), + STATS_DESC_COUNTER(VM, mmu_pte_write), + STATS_DESC_COUNTER(VM, mmu_pde_zapped), + STATS_DESC_COUNTER(VM, mmu_flooded), + STATS_DESC_COUNTER(VM, mmu_recycled), + STATS_DESC_COUNTER(VM, mmu_cache_miss), + STATS_DESC_ICOUNTER(VM, mmu_unsync), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), + STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), + STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) +}; + +const struct kvm_stats_header kvm_vm_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vm_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vm_stats_desc), }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; - -static u64 __read_mostly host_xss; - -struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, - { NULL } +const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { + KVM_GENERIC_VCPU_STATS(), + STATS_DESC_COUNTER(VCPU, pf_taken), + STATS_DESC_COUNTER(VCPU, pf_fixed), + STATS_DESC_COUNTER(VCPU, pf_emulate), + STATS_DESC_COUNTER(VCPU, pf_spurious), + STATS_DESC_COUNTER(VCPU, pf_fast), + STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created), + STATS_DESC_COUNTER(VCPU, pf_guest), + STATS_DESC_COUNTER(VCPU, tlb_flush), + STATS_DESC_COUNTER(VCPU, invlpg), + STATS_DESC_COUNTER(VCPU, exits), + STATS_DESC_COUNTER(VCPU, io_exits), + STATS_DESC_COUNTER(VCPU, mmio_exits), + STATS_DESC_COUNTER(VCPU, signal_exits), + STATS_DESC_COUNTER(VCPU, irq_window_exits), + STATS_DESC_COUNTER(VCPU, nmi_window_exits), + STATS_DESC_COUNTER(VCPU, l1d_flush), + STATS_DESC_COUNTER(VCPU, halt_exits), + STATS_DESC_COUNTER(VCPU, request_irq_exits), + STATS_DESC_COUNTER(VCPU, irq_exits), + STATS_DESC_COUNTER(VCPU, host_state_reload), + STATS_DESC_COUNTER(VCPU, fpu_reload), + STATS_DESC_COUNTER(VCPU, insn_emulation), + STATS_DESC_COUNTER(VCPU, insn_emulation_fail), + STATS_DESC_COUNTER(VCPU, hypercalls), + STATS_DESC_COUNTER(VCPU, irq_injections), + STATS_DESC_COUNTER(VCPU, nmi_injections), + STATS_DESC_COUNTER(VCPU, req_event), + STATS_DESC_COUNTER(VCPU, nested_run), + STATS_DESC_COUNTER(VCPU, directed_yield_attempted), + STATS_DESC_COUNTER(VCPU, directed_yield_successful), + STATS_DESC_COUNTER(VCPU, preemption_reported), + STATS_DESC_COUNTER(VCPU, preemption_other), + STATS_DESC_IBOOLEAN(VCPU, guest_mode), + STATS_DESC_COUNTER(VCPU, notify_window_exits), +}; + +const struct kvm_stats_header kvm_vcpu_stats_header = { + .name_size = KVM_STATS_NAME_SIZE, + .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc), + .id_offset = sizeof(struct kvm_stats_header), + .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE, + .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE + + sizeof(kvm_vcpu_stats_desc), }; u64 __read_mostly host_xcr0; -struct kmem_cache *x86_fpu_cache; -EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_emulator_cache; + +/* + * When called, it means the previous get/set msr reached an invalid msr. + * Return true if we want to ignore/silent this failed msr access. + */ +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) +{ + const char *op = write ? "wrmsr" : "rdmsr"; + + if (ignore_msrs) { + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", + op, msr, data); + /* Mask the error */ + return true; + } else { + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); + return false; + } +} + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); +} static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -252,73 +361,103 @@ static void kvm_on_user_return(struct user_return_notifier *urn) * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(kvm_uret_msrs_list[slot], values->host); values->curr = values->host; } } } -void kvm_define_shared_msr(unsigned slot, u32 msr) +static int kvm_probe_user_return_msr(u32 msr) { - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; + u64 val; + int ret; + + preempt_disable(); + ret = rdmsrl_safe(msr, &val); + if (ret) + goto out; + ret = wrmsrl_safe(msr, val); +out: + preempt_enable(); + return ret; } -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); -static void kvm_shared_msr_cpu_online(void) +int kvm_add_user_return_msr(u32 msr) +{ + BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS); + + if (kvm_probe_user_return_msr(msr)) + return -1; + + kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr; + return kvm_nr_uret_msrs++; +} +EXPORT_SYMBOL_GPL(kvm_add_user_return_msr); + +int kvm_find_user_return_msr(u32 msr) +{ + int i; + + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + if (kvm_uret_msrs_list[i] == msr) + return i; + } + return -1; +} +EXPORT_SYMBOL_GPL(kvm_find_user_return_msr); + +static void kvm_user_return_msr_cpu_online(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; int i; - for (i = 0; i < shared_msrs_global.nr; ++i) { - rdmsrl_safe(shared_msrs_global.msrs[i], &value); - smsr->values[i].host = value; - smsr->values[i].curr = value; + for (i = 0; i < kvm_nr_uret_msrs; ++i) { + rdmsrl_safe(kvm_uret_msrs_list[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } } -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | (smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(kvm_uret_msrs_list[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -337,7 +476,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); - u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | + u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff | (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) @@ -350,11 +489,19 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -386,6 +533,7 @@ static int exception_class(int vector) #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 +#define EXCPT_DB 4 static int exception_type(int vector) { @@ -396,8 +544,14 @@ static int exception_type(int vector) mask = 1 << vector; - /* #DB is trap, as instruction watchpoints are handled elsewhere */ - if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + /* + * #DBs can be trap-like or fault-like, the caller must check other CPU + * state, e.g. DR6, to determine whether a #DB is a trap or fault. + */ + if (mask & (1 << DB_VECTOR)) + return EXCPT_DB; + + if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) @@ -407,16 +561,13 @@ static int exception_type(int vector) return EXCPT_FAULT; } -void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, + struct kvm_queued_exception *ex) { - unsigned nr = vcpu->arch.exception.nr; - bool has_payload = vcpu->arch.exception.has_payload; - unsigned long payload = vcpu->arch.exception.payload; - - if (!has_payload) + if (!ex->has_payload) return; - switch (nr) { + switch (ex->vector) { case DB_VECTOR: /* * "Certain debug exceptions may clear bit 0-3. The @@ -425,19 +576,24 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) */ vcpu->arch.dr6 &= ~DR_TRAP_BITS; /* - * DR6.RTM is set by all #DB exceptions that don't clear it. - */ - vcpu->arch.dr6 |= DR6_RTM; - vcpu->arch.dr6 |= payload; - /* - * Bit 16 should be set in the payload whenever the #DB - * exception should clear DR6.RTM. This makes the payload - * compatible with the pending debug exceptions under VMX. - * Though not currently documented in the SDM, this also - * makes the payload compatible with the exit qualification - * for #DB exceptions under VMX. + * In order to reflect the #DB exception payload in guest + * dr6, three components need to be considered: active low + * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, + * DR6_BS and DR6_BT) + * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. + * In the target guest dr6: + * FIXED_1 bits should always be set. + * Active low bits should be cleared if 1-setting in payload. + * Active high bits should be set if 1-setting in payload. + * + * Note, the payload is compatible with the pending debug + * exceptions/exit qualification under VMX, that active_low bits + * are active high in payload. + * So they need to be flipped for DR6. */ - vcpu->arch.dr6 ^= payload & DR6_RTM; + vcpu->arch.dr6 |= DR6_ACTIVE_LOW; + vcpu->arch.dr6 |= ex->payload; + vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW; /* * The #DB payload is defined as compatible with the 'pending @@ -448,15 +604,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) vcpu->arch.dr6 &= ~BIT(12); break; case PF_VECTOR: - vcpu->arch.cr2 = payload; + vcpu->arch.cr2 = ex->payload; break; } - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; + ex->has_payload = false; + ex->payload = 0; } EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); +static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector, + bool has_error_code, u32 error_code, + bool has_payload, unsigned long payload) +{ + struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; + + ex->vector = vector; + ex->injected = false; + ex->pending = true; + ex->has_error_code = has_error_code; + ex->error_code = error_code; + ex->has_payload = has_payload; + ex->payload = payload; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool has_payload, unsigned long payload, bool reinject) @@ -466,20 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); + /* + * If the exception is destined for L2 and isn't being reinjected, + * morph it to a VM-Exit if L1 wants to intercept the exception. A + * previously injected exception is not checked because it was checked + * when it was original queued, and re-checking is incorrect if _L1_ + * injected the exception, in which case it's exempt from interception. + */ + if (!reinject && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) { + kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code, + has_payload, payload); + return; + } + if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { queue: - if (has_error && !is_protmode(vcpu)) - has_error = false; if (reinject) { /* - * On vmentry, vcpu->arch.exception.pending is only - * true if an event injection was blocked by - * nested_run_pending. In that case, however, - * vcpu_enter_guest requests an immediate exit, - * and the guest shouldn't proceed far enough to - * need reinjection. + * On VM-Entry, an exception can be pending if and only + * if event injection was blocked by nested_run_pending. + * In that case, however, vcpu_enter_guest() requests an + * immediate exit, and the guest shouldn't proceed far + * enough to need reinjection. */ - WARN_ON_ONCE(vcpu->arch.exception.pending); + WARN_ON_ONCE(kvm_is_exception_pending(vcpu)); vcpu->arch.exception.injected = true; if (WARN_ON_ONCE(has_payload)) { /* @@ -494,17 +676,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.injected = false; } vcpu->arch.exception.has_error_code = has_error; - vcpu->arch.exception.nr = nr; + vcpu->arch.exception.vector = nr; vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; if (!is_guest_mode(vcpu)) - kvm_deliver_exception_payload(vcpu); + kvm_deliver_exception_payload(vcpu, + &vcpu->arch.exception); return; } /* to check exception */ - prev_nr = vcpu->arch.exception.nr; + prev_nr = vcpu->arch.exception.vector; if (prev_nr == DF_VECTOR) { /* triple fault -> shutdown */ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); @@ -512,25 +695,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, } class1 = exception_class(prev_nr); class2 = exception_class(nr); - if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) - || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) || + (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { /* - * Generate double fault per SDM Table 5-5. Set - * exception.pending = true so that the double fault - * can trigger a nested vmexit. + * Synthesize #DF. Clear the previously injected or pending + * exception so as not to incorrectly trigger shutdown. */ - vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else + vcpu->arch.exception.pending = false; + + kvm_queue_exception_e(vcpu, DF_VECTOR, 0); + } else { /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost exception */ goto queue; + } } void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) @@ -545,11 +725,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, - unsigned long payload) +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) { kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); } +EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) @@ -569,30 +750,56 @@ int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); +static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_COMPLETE_USER_EXIT); +} + void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.exception.nested_apf = - is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) { - vcpu->arch.apf.nested_apf_token = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); - } else { + + /* + * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of + * whether or not L1 wants to intercept "regular" #PF. + */ + if (is_guest_mode(vcpu) && fault->async_page_fault) + kvm_queue_exception_vmexit(vcpu, PF_VECTOR, + true, fault->error_code, + true, fault->address); + else kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, fault->address); - } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu->inject_page_fault(vcpu, fault); + struct kvm_mmu *fault_mmu; + WARN_ON_ONCE(fault->vector != PF_VECTOR); + + fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : + vcpu->arch.walk_mmu; - return fault->nested_page_fault; + /* + * Invalidate the TLB entry for the faulting address, if it exists, + * else the access will fault indefinitely (and to emulate hardware). + */ + if ((fault->error_code & PFERR_PRESENT_MASK) && + !(fault->error_code & PFERR_RSVD_MASK)) + kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, + fault_mmu->root.hpa); + + fault_mmu->inject_page_fault(vcpu, fault); } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -619,7 +826,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -636,107 +843,88 @@ bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) } EXPORT_SYMBOL_GPL(kvm_require_dr); -/* - * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_vcpu_read_guest_page is that this function - * can read from guest physical or from the guest's guest physical memory. - */ -int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - gfn_t ngfn, void *data, int offset, int len, - u32 access) -{ - struct x86_exception exception; - gfn_t real_gfn; - gpa_t ngpa; - - ngpa = gfn_to_gpa(ngfn); - real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); - if (real_gfn == UNMAPPED_GVA) - return -EFAULT; - - real_gfn = gpa_to_gfn(real_gfn); - - return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); -} -EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); - -static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, - void *data, int offset, int len, u32 access) -{ - return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, - data, offset, len, access); -} - static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) { - return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | - rsvd_bits(1, 2); + return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2); } /* * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. */ -int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; - unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + gpa_t real_gpa; int i; int ret; u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte), - PFERR_USER_MASK|PFERR_WRITE_MASK); - if (ret < 0) { - ret = 0; - goto out; - } + /* + * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated + * to an L1 GPA. + */ + real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn), + PFERR_USER_MASK | PFERR_WRITE_MASK, NULL); + if (real_gpa == INVALID_GPA) + return 0; + + /* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */ + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte, + cr3 & GENMASK(11, 5), sizeof(pdpte)); + if (ret < 0) + return 0; + for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { if ((pdpte[i] & PT_PRESENT_MASK) && (pdpte[i] & pdptr_rsvd_bits(vcpu))) { - ret = 0; - goto out; + return 0; } } - ret = 1; + + /* + * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled. + * Shadow page roots need to be reconstructed instead. + */ + if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs))) + kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT); memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); + vcpu->arch.pdptrs_from_userspace = false; -out: - - return ret; + return 1; } EXPORT_SYMBOL_GPL(load_pdptrs); -bool pdptrs_changed(struct kvm_vcpu *vcpu) +void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - int offset; - gfn_t gfn; - int r; - - if (!is_pae_paging(vcpu)) - return false; + if ((cr0 ^ old_cr0) & X86_CR0_PG) { + kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); - if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - return true; + /* + * Clearing CR0.PG is defined to flush the TLB from the guest's + * perspective. + */ + if (!(cr0 & X86_CR0_PG)) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + } - gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; - offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); - r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), - PFERR_USER_MASK | PFERR_WRITE_MASK); - if (r < 0) - return true; + if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS) + kvm_mmu_reset_context(vcpu); - return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); } -EXPORT_SYMBOL_GPL(pdptrs_changed); +EXPORT_SYMBOL_GPL(kvm_post_set_cr0); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -753,40 +941,30 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return 1; - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; + if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && + (cr0 & X86_CR0_PG)) { + int cs_db, cs_l; - if (!is_pae(vcpu)) - return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; - } else -#endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + if (!is_pae(vcpu)) + return 1; + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + if (cs_l) return 1; } - - if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) +#endif + if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && + is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) && + !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; - kvm_x86_ops->set_cr0(vcpu, cr0); - - if ((cr0 ^ old_cr0) & X86_CR0_PG) { - kvm_clear_async_pf_completion_queue(vcpu); - kvm_async_pf_hash_reset(vcpu); - } + if (!(cr0 & X86_CR0_PG) && + (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))) + return 1; - if ((cr0 ^ old_cr0) & update_bits) - kvm_mmu_reset_context(vcpu); + static_call(kvm_x86_set_cr0)(vcpu, cr0); - if (((cr0 ^ old_cr0) & X86_CR0_CD) && - kvm_arch_has_noncoherent_dma(vcpu->kvm) && - !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) - kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + kvm_post_set_cr0(vcpu, old_cr0, cr0); return 0; } @@ -800,6 +978,9 @@ EXPORT_SYMBOL_GPL(kvm_lmsw); void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -809,11 +990,32 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (static_cpu_has(X86_FEATURE_PKU) && + vcpu->arch.pkru != vcpu->arch.host_pkru && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) + write_pkru(vcpu->arch.pkru); +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return; + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + if (static_cpu_has(X86_FEATURE_PKU) && + ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || + kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + write_pkru(vcpu->arch.host_pkru); + } +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -827,6 +1029,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); +#ifdef CONFIG_X86_64 +static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC; +} +#endif + static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { u64 xcr0 = xcr; @@ -844,7 +1053,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) /* * Do not allow the guest to set bits that we do not support * saving. However, xcr0 bit 0 is always set, even if the - * emulated CPU does not support XSAVE (see fx_init). + * emulated CPU does not support XSAVE (see kvm_vcpu_reset()). */ valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; if (xcr0 & ~valid_bits) @@ -860,85 +1069,106 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) return 1; } + + if ((xcr0 & XFEATURE_MASK_XTILE) && + ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE)) + return 1; + vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } -int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->get_cpl(vcpu) != 0 || - __kvm_set_xcr(vcpu, index, xcr)) { + /* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */ + if (static_call(kvm_x86_get_cpl)(vcpu) != 0 || + __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) { kvm_inject_gp(vcpu, 0); return 1; } - return 0; -} -EXPORT_SYMBOL_GPL(kvm_set_xcr); -#define __cr4_reserved_bits(__cpu_has, __c) \ -({ \ - u64 __reserved_bits = CR4_RESERVED_BITS; \ - \ - if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ - __reserved_bits |= X86_CR4_OSXSAVE; \ - if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ - __reserved_bits |= X86_CR4_SMEP; \ - if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ - __reserved_bits |= X86_CR4_SMAP; \ - if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ - __reserved_bits |= X86_CR4_FSGSBASE; \ - if (!__cpu_has(__c, X86_FEATURE_PKU)) \ - __reserved_bits |= X86_CR4_PKE; \ - if (!__cpu_has(__c, X86_FEATURE_LA57)) \ - __reserved_bits |= X86_CR4_LA57; \ - if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ - __reserved_bits |= X86_CR4_UMIP; \ - __reserved_bits; \ -}) + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv); -static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) +bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); + if (cr4 & cr4_reserved_bits) + return false; - if (cpuid_ecx(0x7) & feature_bit(LA57)) - reserved_bits &= ~X86_CR4_LA57; + if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) + return false; - if (kvm_x86_ops->umip_emulated()) - reserved_bits &= ~X86_CR4_UMIP; + return true; +} +EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4); - return reserved_bits; +static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + return __kvm_is_valid_cr4(vcpu, cr4) && + static_call(kvm_x86_is_valid_cr4)(vcpu, cr4); } -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4) { - if (cr4 & cr4_reserved_bits) - return -EINVAL; + if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS) + kvm_mmu_reset_context(vcpu); - if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) - return -EINVAL; + /* + * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB + * according to the SDM; however, stale prev_roots could be reused + * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we + * free them all. This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST + * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed, + * so fall through. + */ + if (!tdp_enabled && + (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) + kvm_mmu_unload(vcpu); + + /* + * The TLB has to be flushed for all PCIDs if any of the following + * (architecturally required) changes happen: + * - CR4.PCIDE is changed from 1 to 0 + * - CR4.PGE is toggled + * + * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT. + */ + if (((cr4 ^ old_cr4) & X86_CR4_PGE) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + + /* + * The TLB has to be flushed for the current PCID if any of the + * following (architecturally required) changes happen: + * - CR4.SMEP is changed from 0 to 1 + * - CR4.PAE is toggled + */ + else if (((cr4 ^ old_cr4) & X86_CR4_PAE) || + ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP))) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - return 0; } +EXPORT_SYMBOL_GPL(kvm_post_set_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (kvm_valid_cr4(vcpu, cr4)) + if (!kvm_is_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; + if ((cr4 ^ old_cr4) & X86_CR4_LA57) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) - && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS) + && !load_pdptrs(vcpu, kvm_read_cr3(vcpu))) return 1; if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { @@ -950,50 +1180,103 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops->set_cr4(vcpu, cr4)) - return 1; - - if (((cr4 ^ old_cr4) & pdptr_bits) || - (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) - kvm_mmu_reset_context(vcpu); + static_call(kvm_x86_set_cr4)(vcpu, cr4); - if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid(vcpu); + kvm_post_set_cr4(vcpu, old_cr4, cr4); return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr4); +static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid) +{ + struct kvm_mmu *mmu = vcpu->arch.mmu; + unsigned long roots_to_free = 0; + int i; + + /* + * MOV CR3 and INVPCID are usually not intercepted when using TDP, but + * this is reachable when running EPT=1 and unrestricted_guest=0, and + * also via the emulator. KVM's TDP page tables are not in the scope of + * the invalidation, but the guest's TLB entries need to be flushed as + * the CPU may have cached entries in its TLB for the target PCID. + */ + if (unlikely(tdp_enabled)) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; + } + + /* + * If neither the current CR3 nor any of the prev_roots use the given + * PCID, then nothing needs to be done here because a resync will + * happen anyway before switching to any other CR3. + */ + if (kvm_get_active_pcid(vcpu) == pcid) { + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + /* + * If PCID is disabled, there is no need to free prev_roots even if the + * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB + * with PCIDE=0. + */ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return; + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); +} + int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { bool skip_tlb_flush = false; + unsigned long pcid = 0; #ifdef CONFIG_X86_64 bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); if (pcid_enabled) { skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; cr3 &= ~X86_CR3_PCID_NOFLUSH; + pcid = cr3 & X86_CR3_PCID_MASK; } #endif - if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { - if (!skip_tlb_flush) { - kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } - return 0; - } + /* PDPTRs are always reloaded for PAE paging. */ + if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu)) + goto handle_tlb_flush; - if (is_long_mode(vcpu) && - (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) + /* + * Do not condition the GPA check on long mode, this helper is used to + * stuff CR3, e.g. for RSM emulation, and there is no guarantee that + * the current vCPU mode is accurate. + */ + if (kvm_vcpu_is_illegal_gpa(vcpu, cr3)) return 1; - else if (is_pae_paging(vcpu) && - !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + + if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + if (cr3 != kvm_read_cr3(vcpu)) + kvm_mmu_new_pgd(vcpu, cr3); + vcpu->arch.cr3 = cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + /* Do not call post_set_cr3, we do not get here for confidential guests. */ + +handle_tlb_flush: + /* + * A load of CR3 that flushes the TLB flushes only the current PCID, + * even if PCID is disabled, in which case PCID=0 is flushed. It's a + * moot point in the end because _disabling_ PCID will flush all PCIDs, + * and it's impossible to use a non-zero PCID when PCID is disabled, + * i.e. only PCID=0 can be relevant. + */ + if (!skip_tlb_flush) + kvm_invalidate_pcid(vcpu, pcid); return 0; } @@ -1027,17 +1310,10 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } -static void kvm_update_dr6(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); -} - -static void kvm_update_dr7(struct kvm_vcpu *vcpu) +void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -1045,11 +1321,12 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops->set_dr7(vcpu, dr7); + static_call(kvm_x86_set_dr7)(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -1057,10 +1334,13 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) fixed |= DR6_RTM; + + if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) + fixed |= DR6_BUS_LOCK; return fixed; } -static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1071,18 +1351,15 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) vcpu->arch.eff_db[dr] = val; break; case 4: - /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) - return -1; /* #GP */ + if (!kvm_dr6_valid(val)) + return 1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - kvm_update_dr6(vcpu); break; case 5: - /* fall through */ default: /* 7 */ if (!kvm_dr7_valid(val)) - return -1; /* #GP */ + return 1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); break; @@ -1090,18 +1367,9 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) return 0; } - -int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) -{ - if (__kvm_set_dr(vcpu, dr, val)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return 0; -} EXPORT_SYMBOL_GPL(kvm_set_dr); -int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) { size_t size = ARRAY_SIZE(vcpu->arch.db); @@ -1110,37 +1378,32 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: - /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *val = vcpu->arch.dr6; - else - *val = kvm_x86_ops->get_dr6(vcpu); + *val = vcpu->arch.dr6; break; case 5: - /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; break; } - return 0; } EXPORT_SYMBOL_GPL(kvm_get_dr); -bool kvm_rdpmc(struct kvm_vcpu *vcpu) +int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; - int err; - err = kvm_pmu_rdpmc(vcpu, ecx, &data); - if (err) - return err; + if (kvm_pmu_rdpmc(vcpu, ecx, &data)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_rax_write(vcpu, (u32)data); kvm_rdx_write(vcpu, data >> 32); - return err; + return kvm_skip_emulated_instruction(vcpu); } -EXPORT_SYMBOL_GPL(kvm_rdpmc); +EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -1172,27 +1435,31 @@ static const u32 msrs_to_save_all[] = { MSR_IA32_UMWAIT_CONTROL, MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, - MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, - MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, - MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, - MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, - MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, - MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, - MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, - MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, - MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; @@ -1214,13 +1481,18 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, - MSR_IA32_TSCDEADLINE, + MSR_IA32_TSC_DEADLINE, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1230,6 +1502,7 @@ static const u32 emulated_msrs_all[] = { MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_AMD64_TSC_RATIO, MSR_IA32_POWER_CTL, MSR_IA32_UCODE_REV, @@ -1287,22 +1560,43 @@ static const u32 msr_based_features_all[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + static u64 kvm_get_arch_capabilities(void) { u64 data = 0; - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + data &= KVM_SUPPORTED_ARCH_CAP; + } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that * the nested hypervisor runs with NX huge pages. If it is not, - * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other * L1 guests, so it need not worry about its own (L2) guests. */ data |= ARCH_CAP_PSCHANGE_MC_NO; @@ -1326,16 +1620,24 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; - /* - * On TAA affected systems: - * - nothing to do if TSX is disabled on the host. - * - we emulate TSX_CTRL if present on the host. - * This lets the guest use VERW to clear CPU buffers. - */ - if (!boot_cpu_has(X86_FEATURE_RTM)) - data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); - else if (!boot_cpu_has_bug(X86_BUG_TAA)) + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ + data &= ~ARCH_CAP_TAA_NO; + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. + */ + } return data; } @@ -1350,8 +1652,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops->get_msr_feature(msr)) - return 1; + return static_call(kvm_x86_get_msr_feature)(msr); } return 0; } @@ -1363,6 +1664,14 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) msr.index = index; r = kvm_get_msr_feature(&msr); + + if (r == KVM_MSR_RET_INVALID) { + /* Unconditionally clear the output for simplicity */ + *data = 0; + if (kvm_msr_ignored_check(index, 0, false)) + r = 0; + } + if (r) return r; @@ -1402,6 +1711,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1418,10 +1728,13 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops->set_efer(vcpu, efer); + r = static_call(kvm_x86_set_efer)(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } - /* Update reserved bits */ - if ((efer ^ old_efer) & EFER_NX) + if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) kvm_mmu_reset_context(vcpu); return 0; @@ -1433,6 +1746,49 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; + struct kvm *kvm = vcpu->kvm; + bool allowed; + int idx; + u32 i; + + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) + return true; + + idx = srcu_read_lock(&kvm->srcu); + + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + allowed = !!test_bit(index - start, bitmap); + break; + } + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + + return allowed; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* * Write @data into the MSR specified by @index. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1467,14 +1823,50 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. */ - data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); + data = __canonical_address(data, vcpu_virt_addr_bits(vcpu)); + break; + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + + /* + * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has + * incomplete and conflicting architectural behavior. Current + * AMD CPUs completely ignore bits 63:32, i.e. they aren't + * reserved and always read as zeros. Enforce Intel's reserved + * bits check if and only if the guest CPU is Intel, and clear + * the bits in all other cases. This ensures cross-vendor + * migration will provide consistent behavior for the guest. + */ + if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) + return 1; + + data = (u32)data; + break; } msr.data = data; msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops->set_msr(vcpu, &msr); + return static_call(kvm_x86_set_msr)(vcpu, &msr); +} + +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + int ret = __kvm_set_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) + if (kvm_msr_ignored_check(index, data, true)) + ret = 0; + + return ret; } /* @@ -1489,43 +1881,154 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, struct msr_data msr; int ret; + switch (index) { + case MSR_TSC_AUX: + if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) + return 1; + + if (!host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && + !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) + return 1; + break; + } + msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops->get_msr(vcpu, &msr); + ret = static_call(kvm_x86_get_msr)(vcpu, &msr); if (!ret) *data = msr.data; return ret; } +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) { + /* Unconditionally clear *data for simplicity */ + *data = 0; + if (kvm_msr_ignored_check(index, 0, false)) + ret = 0; + } + + return ret; +} + +static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + return kvm_get_msr_ignored_check(vcpu, index, data, false); +} + +static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + return kvm_set_msr_ignored_check(vcpu, index, data, false); +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, false); + return kvm_get_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_get_msr); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return __kvm_set_msr(vcpu, index, data, false); + return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); +static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) +{ + if (!vcpu->run->msr.error) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } +} + +static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) +{ + return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_emulated_msr_access(vcpu); +} + +static int complete_fast_msr_access(struct kvm_vcpu *vcpu) +{ + return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); +} + +static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) +{ + complete_userspace_rdmsr(vcpu); + return complete_fast_msr_access(vcpu); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case KVM_MSR_RET_INVALID: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case KVM_MSR_RET_FILTERED: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data; + int r; + + r = kvm_get_msr_with_filter(vcpu, ecx, &data); - if (kvm_get_msr(vcpu, ecx, &data)) { + if (!r) { + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + } else { + /* MSR read failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0, + complete_fast_rdmsr, r)) + return 0; trace_kvm_msr_read_ex(ecx); - kvm_inject_gp(vcpu, 0); - return 1; } - trace_kvm_msr_read(ecx, data); - - kvm_rax_write(vcpu, data & -1u); - kvm_rdx_write(vcpu, (data >> 32) & -1u); - return kvm_skip_emulated_instruction(vcpu); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); @@ -1533,18 +2036,76 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) { u32 ecx = kvm_rcx_read(vcpu); u64 data = kvm_read_edx_eax(vcpu); + int r; - if (kvm_set_msr(vcpu, ecx, data)) { + r = kvm_set_msr_with_filter(vcpu, ecx, data); + + if (!r) { + trace_kvm_msr_write(ecx, data); + } else { + /* MSR write failed? See if we should ask user space */ + if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data, + complete_fast_msr_access, r)) + return 0; + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; trace_kvm_msr_write_ex(ecx, data); - kvm_inject_gp(vcpu, 0); - return 1; } - trace_kvm_msr_write(ecx, data); - return kvm_skip_emulated_instruction(vcpu); + return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); } EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); +int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) +{ + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_as_nop); + +int kvm_emulate_invd(struct kvm_vcpu *vcpu) +{ + /* Treat an INVD instruction as a NOP and just skip it. */ + return kvm_emulate_as_nop(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_invd); + +int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); + + +static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) +{ + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) && + !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) + return kvm_handle_invalid_op(vcpu); + + pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn); + return kvm_emulate_as_nop(vcpu); +} +int kvm_emulate_mwait(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MWAIT"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_mwait); + +int kvm_emulate_monitor(struct kvm_vcpu *vcpu) +{ + return kvm_emulate_monitor_mwait(vcpu, "MONITOR"); +} +EXPORT_SYMBOL_GPL(kvm_emulate_monitor); + +static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +{ + xfer_to_guest_mode_prepare(); + return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || + xfer_to_guest_mode_work_pending(); +} + /* * The fast path for frequent and performance sensitive wrmsr emulation, * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces @@ -1554,37 +2115,56 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); */ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) { - if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) && - ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && - ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; - kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); - } + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && + ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && + ((u32)(data >> 32) != X2APIC_BROADCAST)) + return kvm_x2apic_icr_write(vcpu->arch.apic, data); return 1; } -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) +{ + if (!kvm_can_use_hv_timer(vcpu)) + return 1; + + kvm_set_lapic_tscdeadline_msr(vcpu, data); + return 0; +} + +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); - int ret = 0; + u64 data; + fastpath_t ret = EXIT_FASTPATH_NONE; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): - ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_EXIT_HANDLED; + } + break; + case MSR_IA32_TSC_DEADLINE: + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_tscdeadline(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_REENTER_GUEST; + } break; default: - return EXIT_FASTPATH_NONE; + break; } - if (!ret) { + if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); - return EXIT_FASTPATH_SKIP_EMUL_INS; - } - return EXIT_FASTPATH_NONE; + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -1593,12 +2173,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, true); + return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_set_msr(vcpu, index, *data, true); + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -1631,7 +2211,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata->clock.cycle_last = tk->tkr_mono.cycle_last; vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; @@ -1639,7 +2219,7 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; vdata->clock.offset = tk->tkr_mono.base; - vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; @@ -1667,17 +2247,12 @@ static s64 get_kvmclock_base_ns(void) } #endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) -{ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); - kvm_vcpu_kick(vcpu); -} - -static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) { int version; int r; struct pvclock_wall_clock wc; + u32 wc_sec_hi; u64 wall_nsec; if (!wall_clock) @@ -1708,10 +2283,43 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + if (sec_hi_ofs) { + wc_sec_hi = wall_nsec >> 32; + kvm_write_guest(kvm, wall_clock + sec_hi_ofs, + &wc_sec_hi, sizeof(wc_sec_hi)); + } + version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); } +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock != old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... */ + if (system_time & 1) { + kvm_gpc_activate(vcpu->kvm, &vcpu->arch.pv_time, vcpu, + KVM_HOST_USES_PFN, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info)); + } else { + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); + } + + return; +} + static uint32_t div_frac(uint32_t dividend, uint32_t divisor) { do_shl32_div32(dividend, divisor); @@ -1760,18 +2368,20 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); + static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { u64 ratio; /* Guest TSC same frequency as host TSC? */ if (!scale) { - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return 0; } /* TSC scaling supported? */ - if (!kvm_has_tsc_control) { + if (!kvm_caps.has_tsc_control) { if (user_tsc_khz > tsc_khz) { vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_always_catchup = 1; @@ -1783,16 +2393,16 @@ static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) } /* TSC scaling required - calculate ratio */ - ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits, user_tsc_khz, tsc_khz); - if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", user_tsc_khz); return -1; } - vcpu->arch.tsc_scaling_ratio = ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, ratio); return 0; } @@ -1804,7 +2414,7 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) /* tsc_khz can be zero if TSC calibration fails */ if (user_tsc_khz == 0) { /* set tsc_scaling_ratio to a safe value */ - vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio); return -1; } @@ -1838,10 +2448,12 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +#ifdef CONFIG_X86_64 static inline int gtod_is_based_on_tsc(int mode) { - return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; + return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } +#endif static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { @@ -1871,12 +2483,6 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) #endif } -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; -} - /* * Multiply tsc by a fixed point number represented by ratio. * @@ -1885,45 +2491,104 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) * (frac) represent the fractional part, ie. ratio represents a fixed * point number (mult + frac * 2^(-N)). * - * N equals to kvm_tsc_scaling_ratio_frac_bits. + * N equals to kvm_caps.tsc_scaling_ratio_frac_bits. */ static inline u64 __scale_tsc(u64 ratio, u64 tsc) { - return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); + return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits); } -u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +u64 kvm_scale_tsc(u64 tsc, u64 ratio) { u64 _tsc = tsc; - u64 ratio = vcpu->arch.tsc_scaling_ratio; - if (ratio != kvm_default_tsc_scaling_ratio) + if (ratio != kvm_caps.default_tsc_scaling_ratio) _tsc = __scale_tsc(ratio, tsc); return _tsc; } EXPORT_SYMBOL_GPL(kvm_scale_tsc); -static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = kvm_scale_tsc(vcpu, rdtsc()); + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio); return target_tsc - tsc; } u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); - - return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + + kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); -static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) +{ + u64 nested_offset; + + if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) + nested_offset = l1_offset; + else + nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier, + kvm_caps.tsc_scaling_ratio_frac_bits); + + nested_offset += l2_offset; + return nested_offset; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); + +u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) { - vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); + if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) + return mul_u64_u64_shr(l1_multiplier, l2_multiplier, + kvm_caps.tsc_scaling_ratio_frac_bits); + + return l1_multiplier; +} +EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); + +static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) +{ + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + vcpu->arch.l1_tsc_offset, + l1_offset); + + vcpu->arch.l1_tsc_offset = l1_offset; + + /* + * If we are here because L1 chose not to trap WRMSR to TSC then + * according to the spec this should set L1's TSC (as opposed to + * setting L1's offset for L2). + */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( + l1_offset, + static_call(kvm_x86_get_l2_tsc_offset)(vcpu), + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_offset = l1_offset; + + static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset); +} + +static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) +{ + vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; + + /* Userspace is changing the multiplier while L2 is active */ + if (is_guest_mode(vcpu)) + vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( + l1_multiplier, + static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); + else + vcpu->arch.tsc_scaling_ratio = l1_multiplier; + + if (kvm_caps.has_tsc_control) + static_call(kvm_x86_write_tsc_multiplier)( + vcpu, vcpu->arch.tsc_scaling_ratio); } static inline bool kvm_check_tsc_unstable(void) @@ -1933,29 +2598,79 @@ static inline bool kvm_check_tsc_unstable(void) * TSC is marked unstable when we're running on Hyper-V, * 'TSC page' clocksource is good. */ - if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) return false; #endif return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +/* + * Infers attempts to synchronize the guest's tsc from host writes. Sets the + * offset for the vcpu and tracks the TSC matching generation that the vcpu + * participates in. + */ +static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, + u64 ns, bool matched) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.tsc_write_lock); + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = tsc; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; + kvm->arch.last_tsc_offset = offset; + + vcpu->arch.last_guest_tsc = tsc; + + kvm_vcpu_write_tsc_offset(vcpu, offset); + + if (!matched) { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computation in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = tsc; + kvm->arch.cur_tsc_offset = offset; + kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { + kvm->arch.nr_vcpus_matched_tsc++; + } + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_track_tsc_matching(vcpu); +} + +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - bool matched; - bool already_matched; - u64 data = msr->data; + bool matched = false; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. This particularly helps to keep @@ -1989,73 +2704,28 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_compute_tsc_offset(vcpu, data); + offset = kvm_compute_l1_tsc_offset(vcpu, data); } matched = true; - already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); - } else { - /* - * We split periods of matched TSC writes into generations. - * For each generation, we track the original measured - * nanosecond time, offset, and write, so if TSCs are in - * sync, we can match exact offset, and if not, we can match - * exact software computation in compute_guest_tsc() - * - * These values are tracked in kvm->arch.cur_xxx variables. - */ - kvm->arch.cur_tsc_generation++; - kvm->arch.cur_tsc_nsec = ns; - kvm->arch.cur_tsc_write = data; - kvm->arch.cur_tsc_offset = offset; - matched = false; } - /* - * We also track th most recent recorded KHZ, write and time to - * allow the matching interval to be extended at each write. - */ - kvm->arch.last_tsc_nsec = ns; - kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - - vcpu->arch.last_guest_tsc = data; - - /* Keep track of which generation this VCPU has synchronized to */ - vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; - vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; - vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - - kvm_vcpu_write_tsc_offset(vcpu, offset); + __kvm_synchronize_tsc(vcpu, offset, data, ns, matched); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (!matched) { - kvm->arch.nr_vcpus_matched_tsc = 0; - } else if (!already_matched) { - kvm->arch.nr_vcpus_matched_tsc++; - } - - kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) { - if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) WARN_ON(adjustment < 0); - adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + adjustment = kvm_scale_tsc((u64) adjustment, + vcpu->arch.l1_tsc_scaling_ratio); adjust_tsc_offset_guest(vcpu, adjustment); } @@ -2088,30 +2758,30 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, u64 tsc_pg_val; switch (clock->vclock_mode) { - case VCLOCK_HVCLOCK: + case VDSO_CLOCKMODE_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ - *mode = VCLOCK_HVCLOCK; + *mode = VDSO_CLOCKMODE_HVCLOCK; v = (tsc_pg_val - clock->cycle_last) & clock->mask; } else { /* TSC page invalid */ - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } break; - case VCLOCK_TSC: - *mode = VCLOCK_TSC; + case VDSO_CLOCKMODE_TSC: + *mode = VDSO_CLOCKMODE_TSC; *tsc_timestamp = read_tsc(); v = (*tsc_timestamp - clock->cycle_last) & clock->mask; break; default: - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } - if (*mode == VCLOCK_NONE) + if (*mode == VDSO_CLOCKMODE_NONE) *tsc_timestamp = v = 0; return v * clock->mult; @@ -2228,6 +2898,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) int vclock_mode; bool host_tsc_clocksource, vcpus_matched; + lockdep_assert_held(&kvm->arch.tsc_write_lock); vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == atomic_read(&kvm->online_vcpus)); @@ -2252,126 +2923,158 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) #endif } -void kvm_make_mclock_inprogress_request(struct kvm *kvm) +static void kvm_make_mclock_inprogress_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); } -static void kvm_gen_update_masterclock(struct kvm *kvm) +static void __kvm_start_pvclock_update(struct kvm *kvm) { -#ifdef CONFIG_X86_64 - int i; - struct kvm_vcpu *vcpu; - struct kvm_arch *ka = &kvm->arch; + raw_spin_lock_irq(&kvm->arch.tsc_write_lock); + write_seqcount_begin(&kvm->arch.pvclock_sc); +} - spin_lock(&ka->pvclock_gtod_sync_lock); +static void kvm_start_pvclock_update(struct kvm *kvm) +{ kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ - pvclock_update_vm_gtod_copy(kvm); + __kvm_start_pvclock_update(kvm); +} + +static void kvm_end_pvclock_update(struct kvm *kvm) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_vcpu *vcpu; + unsigned long i; + write_seqcount_end(&ka->pvclock_sc); + raw_spin_unlock_irq(&ka->tsc_write_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); +} - spin_unlock(&ka->pvclock_gtod_sync_lock); -#endif +static void kvm_update_masterclock(struct kvm *kvm) +{ + kvm_hv_request_tsc_page_update(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + kvm_end_pvclock_update(kvm); } -u64 get_kvmclock_ns(struct kvm *kvm) +/* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ +static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; - u64 ret; - - spin_lock(&ka->pvclock_gtod_sync_lock); - if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); - return get_kvmclock_base_ns() + ka->kvmclock_offset; - } - - hv_clock.tsc_timestamp = ka->master_cycle_now; - hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); - if (__this_cpu_read(cpu_tsc_khz)) { + data->flags = 0; + if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) { +#ifdef CONFIG_X86_64 + struct timespec64 ts; + + if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) { + data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; + data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; + } else +#endif + data->host_tsc = rdtsc(); + + data->flags |= KVM_CLOCK_TSC_STABLE; + hv_clock.tsc_timestamp = ka->master_cycle_now; + hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, &hv_clock.tsc_shift, &hv_clock.tsc_to_system_mul); - ret = __pvclock_read_cycles(&hv_clock, rdtsc()); - } else - ret = get_kvmclock_base_ns() + ka->kvmclock_offset; + data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc); + } else { + data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; + } put_cpu(); +} - return ret; +static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) +{ + struct kvm_arch *ka = &kvm->arch; + unsigned seq; + + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + __get_kvmclock(kvm, data); + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); } -static void kvm_setup_pvclock_page(struct kvm_vcpu *v) +u64 get_kvmclock_ns(struct kvm *kvm) +{ + struct kvm_clock_data data; + + get_kvmclock(kvm, &data); + return data.clock; +} + +static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, + struct gfn_to_pfn_cache *gpc, + unsigned int offset) { struct kvm_vcpu_arch *vcpu = &v->arch; - struct pvclock_vcpu_time_info guest_hv_clock; + struct pvclock_vcpu_time_info *guest_hv_clock; + unsigned long flags; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, - &guest_hv_clock, sizeof(guest_hv_clock)))) - return; + read_lock_irqsave(&gpc->lock, flags); + while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) { + read_unlock_irqrestore(&gpc->lock, flags); - /* This VCPU is paused, but it's legal for a guest to read another + if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, + offset + sizeof(*guest_hv_clock))) + return; + + read_lock_irqsave(&gpc->lock, flags); + } + + guest_hv_clock = (void *)(gpc->khva + offset); + + /* + * This VCPU is paused, but it's legal for a guest to read another * VCPU's kvmclock, so we really have to follow the specification where * it says that version is odd if data is being modified, and even after * it is consistent. - * - * Version field updates must be kept separate. This is because - * kvm_write_guest_cached might use a "rep movs" instruction, and - * writes within a string instruction are weakly ordered. So there - * are three writes overall. - * - * As a small optimization, only write the version field in the first - * and third write. The vcpu->pv_time cache is still valid, because the - * version field is the first in the struct. */ - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); - - if (guest_hv_clock.version & 1) - ++guest_hv_clock.version; /* first time write, random junk */ - - vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ - vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); if (vcpu->pvclock_set_guest_stopped_request) { vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; vcpu->pvclock_set_guest_stopped_request = false; } - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); + memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); + smp_wmb(); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + guest_hv_clock->version = ++vcpu->hv_clock.version; - smp_wmb(); + mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT); + read_unlock_irqrestore(&gpc->lock, flags); - vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); } static int kvm_guest_time_update(struct kvm_vcpu *v) { unsigned long flags, tgt_tsc_khz; + unsigned seq; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -2386,13 +3089,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); - use_master_clock = ka->use_master_clock; - if (use_master_clock) { - host_tsc = ka->master_cycle_now; - kernel_ns = ka->master_kernel_ns; - } - spin_unlock(&ka->pvclock_gtod_sync_lock); + do { + seq = read_seqcount_begin(&ka->pvclock_sc); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + } while (read_seqcount_retry(&ka->pvclock_sc, seq)); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -2431,8 +3135,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* With all the info we got, fill in the values */ - if (kvm_has_tsc_control) - tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); + if (kvm_caps.has_tsc_control) + tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz, + v->arch.l1_tsc_scaling_ratio); if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, @@ -2444,7 +3149,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; - WARN_ON((s64)vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -2453,10 +3157,14 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.flags = pvclock_flags; - if (vcpu->pv_time_enabled) - kvm_setup_pvclock_page(v); - if (v == kvm_get_vcpu(v->kvm, 0)) - kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); + if (vcpu->pv_time.active) + kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0); + if (vcpu->xen.vcpu_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache, + offsetof(struct compat_vcpu_info, time)); + if (vcpu->xen.vcpu_time_info_cache.active) + kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0); + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); return 0; } @@ -2478,7 +3186,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) static void kvmclock_update_fn(struct work_struct *work) { - int i; + unsigned long i; struct delayed_work *dwork = to_delayed_work(work); struct kvm_arch *ka = container_of(dwork, struct kvm_arch, kvmclock_update_work); @@ -2517,13 +3225,23 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -2535,6 +3253,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -2548,149 +3267,269 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return -1; - } + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - vcpu->arch.mce_banks[offset] = data; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) + return 1; + /* An attempt to write a 1 to a reserved bit raises #GP */ + if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + vcpu->arch.mci_ctl2_banks[offset] = data; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if (is_mci_control_msr(msr) && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + vcpu->arch.mce_banks[offset] = data; + break; + default: return 1; } return 0; } -static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) +static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - int lm = is_long_mode(vcpu); - u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 - : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; - u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 - : kvm->arch.xen_hvm_config.blob_size_32; - u32 page_num = data & ~PAGE_MASK; - u64 page_addr = data & PAGE_MASK; - u8 *page; - int r; + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; - r = -E2BIG; - if (page_num >= blob_size) - goto out; - r = -ENOMEM; - page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; - } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return (vcpu->arch.apf.msr_en_val & mask) == mask; } static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 3:5 are reserved, Should be zero */ - if (data & 0x38) + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) return 1; - vcpu->arch.apf.msr_val = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; + + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + + if (!lapic_in_kernel(vcpu)) + return data ? 1 : 0; - if (!(data & KVM_ASYNC_PF_ENABLED)) { + vcpu->arch.apf.msr_en_val = data; + + if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) + sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + kvm_async_pf_wakeup_all(vcpu); + + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + return 0; } static void kvmclock_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.pv_time_enabled = false; + kvm_gpc_deactivate(vcpu->kvm, &vcpu->arch.pv_time); vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + static_call(kvm_x86_flush_tlb_all)(vcpu); +} + +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + + if (!tdp_enabled) { + /* + * A TLB flush on behalf of the guest is equivalent to + * INVPCID(all), toggling CR4.PGE, etc., which requires + * a forced sync of the shadow page tables. Ensure all the + * roots are synced and the guest TLB in hardware is clean. + */ + kvm_mmu_sync_roots(vcpu); + kvm_mmu_sync_prev_roots(vcpu); + } + + static_call(kvm_x86_flush_tlb_guest)(vcpu); +} + + +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); + static_call(kvm_x86_flush_tlb_current)(vcpu); } +/* + * Service "local" TLB flush requests, which are specific to the current MMU + * context. In addition to the generic event handling in vcpu_enter_guest(), + * TLB flushes that are targeted at an MMU context also need to be serviced + * prior before nested VM-Enter/VM-Exit. + */ +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); + static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + u64 steal; + u32 version; + + if (kvm_xen_msr_enabled(vcpu->kvm)) { + kvm_xen_runstate_set_running(vcpu); + return; + } if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ - trace_kvm_pv_tlb_flush(vcpu->vcpu_id, - st->preempted & KVM_VCPU_FLUSH_TLB); - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = 0; + int err = -EFAULT; - vcpu->arch.st.preempted = 0; + if (!user_access_begin(st, sizeof(*st))) + return; - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; + + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; + } else { + if (!user_access_begin(st, sizeof(*st))) + return; + + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } + + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); - - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -2699,6 +3538,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u32 msr = msr_info->index; u64 data = msr_info->data; + if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) + return kvm_xen_write_hypercall_page(vcpu, data); + switch (msr) { case MSR_AMD64_NB_CFG: case MSR_IA32_UCODE_WRITE: @@ -2718,6 +3560,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; + case MSR_IA32_PERF_CAPABILITIES: { + struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + + if (!msr_info->host_initiated) + return 1; + if (kvm_get_msr_feature(&msr_ent)) + return 1; + if (data & ~msr_ent.data) + return 1; + + vcpu->arch.perf_capabilities = data; + kvm_pmu_refresh(vcpu); + return 0; + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -2741,25 +3597,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case MSR_IA32_DEBUGCTLMSR: - if (!data) { - /* We support the non-activated case already */ - break; - } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { - /* Values other than LBR and BTF are vendor-specific, - thus reserved and should throw a #GP */ - return 1; - } - vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); - break; - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_write(vcpu, msr, data); - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; case MSR_IA32_TSC_ADJUST: @@ -2767,21 +3612,38 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } break; - case MSR_IA32_MISC_ENABLE: + case MSR_IA32_MISC_ENABLE: { + u64 old_val = vcpu->arch.ia32_misc_enable_msr; + + if (!msr_info->host_initiated) { + /* RO bits */ + if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) + return 1; + + /* R bits, i.e. writes are ignored, but don't fault. */ + data = data & ~MSR_IA32_MISC_ENABLE_EMON; + data |= old_val & MSR_IA32_MISC_ENABLE_EMON; + } + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && - ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } break; + } case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; @@ -2791,21 +3653,27 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.msr_ia32_power_ctl = data; break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } break; case MSR_IA32_XSS: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* - * We do support PT if kvm_x86_ops->pt_supported(), but we do - * not support IA32_XSS[bit 8]. Guests will have to use - * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT - * MSRs. + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data != 0) + if (data & ~kvm_caps.supported_xss) return 1; vcpu->arch.ia32_xss = data; + kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -2813,43 +3681,56 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data, 0); + break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + vcpu->kvm->arch.wall_clock = data; - kvm_write_wall_clock(vcpu->kvm, data); + kvm_write_wall_clock(vcpu->kvm, data, 0); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... */ - vcpu->arch.pv_time_enabled = false; - if (!(data & 1)) - break; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; - if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = true; + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); + break; + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); break; - } case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (data & 0x1) { + vcpu->arch.apf.pageready_pending = false; + kvm_check_async_pf_completion(vcpu); + } + break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -2866,11 +3747,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_PV_EOI_EN: - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + /* only enable bit supported */ if (data & (-1ULL << 1)) return 1; @@ -2881,11 +3768,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr_info); case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; /* fall through */ + pr = true; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) @@ -2906,6 +3795,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -2946,22 +3837,43 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.msr_misc_features_enables = data; break; +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data); + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + if (data & ~kvm_guest_supported_xfd(vcpu)) + return 1; + + vcpu->arch.guest_fpu.xfd_err = data; + break; +#endif + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + /* + * Userspace is allowed to write '0' to MSRs that KVM reports + * as to-be-saved, even if an MSRs isn't fully supported. + */ + return !msr_info->host_initiated || data; default: - if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) - return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, - "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); - break; - } + return KVM_MSR_RET_INVALID; } return 0; } @@ -2972,6 +3884,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -2989,16 +3902,27 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + if (!(mcg_cap & MCG_CMCI_P) && !host) + return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, + last_msr + 1 - MSR_IA32_MC0_CTL2); + data = vcpu->arch.mci_ctl2_banks[offset]; + break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; @@ -3010,12 +3934,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: - case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: case MSR_IA32_LASTBRANCHTOIP: case MSR_IA32_LASTINTFROMIP: case MSR_IA32_LASTINTTOIP: - case MSR_K8_SYSCFG: + case MSR_AMD64_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: case MSR_VM_HSAVE_PA: @@ -3026,15 +3949,39 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_F15H_EX_CFG: + /* + * Intel Sandy Bridge CPUs must support the RAPL (running average power + * limit) MSRs. Just return 0, as we do not want to expose the host + * data here. Do not conditionalize this on CPUID, as KVM does not do + * so for existing CPU-specific MSRs. + */ + case MSR_RAPL_POWER_UNIT: + case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ + case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ + case MSR_PKG_ENERGY_STATUS: /* Total package */ + case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; + case MSR_IA32_PEBS_ENABLE: + case MSR_IA32_DS_AREA: + case MSR_PEBS_DATA_CFG: case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info); + /* + * Userspace is allowed to read MSRs that KVM reports as + * to-be-saved, even if an MSR isn't fully supported. + */ + if (!msr_info->host_initiated) + return 1; + msr_info->data = 0; + break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -3046,14 +3993,41 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; + break; case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; - case MSR_IA32_TSC: - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * On userspace reads and writes, however, we unconditionally + * return L1's TSC value to ensure backwards-compatible + * behavior for migration. + */ + u64 offset, ratio; + + if (msr_info->host_initiated) { + offset = vcpu->arch.l1_tsc_offset; + ratio = vcpu->arch.l1_tsc_scaling_ratio; + } else { + offset = vcpu->arch.tsc_offset; + ratio = vcpu->arch.tsc_scaling_ratio; + } + + msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; + } case MSR_MTRRcap: - case 0x200 ... 0x2ff: + case 0x200 ... MSR_IA32_MC0_CTL2 - 1: + case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; @@ -3075,10 +4049,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_APICBASE: msr_info->data = kvm_get_apic_base(vcpu); break; - case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: + case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; - case MSR_IA32_TSCDEADLINE: + case MSR_IA32_TSC_DEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: @@ -3105,23 +4078,63 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->arch.time; + break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + + msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + msr_info->data = vcpu->arch.apf.msr_int_val; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + msr_info->data = vcpu->arch.msr_kvm_poll_control; break; case MSR_IA32_P5_MC_ADDR: @@ -3130,6 +4143,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); case MSR_IA32_XSS: @@ -3151,6 +4165,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3160,7 +4176,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -3196,20 +4211,26 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_K7_HWCR: msr_info->data = vcpu->arch.msr_hwcr; break; - default: - if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", - msr_info->index); +#ifdef CONFIG_X86_64 + case MSR_IA32_XFD: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", - msr_info->index); - msr_info->data = 0; - } + + msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; + break; + case MSR_IA32_XFD_ERR: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) + return 1; + + msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; +#endif + default: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info); + return KVM_MSR_RET_INVALID; } return 0; } @@ -3287,6 +4308,27 @@ static inline bool kvm_can_mwait_in_guest(void) boot_cpu_has(X86_FEATURE_ARAT); } +static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, + struct kvm_cpuid2 __user *cpuid_arg) +{ + struct kvm_cpuid2 cpuid; + int r; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + return r; + + r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries); + if (r) + return r; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + return r; + + return 0; +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -3311,7 +4353,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: - case KVM_CAP_XEN_HVM: case KVM_CAP_VCPU_EVENTS: case KVM_CAP_HYPERV: case KVM_CAP_HYPERV_VAPIC: @@ -3323,11 +4364,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TLBFLUSH: case KVM_CAP_HYPERV_SEND_IPI: case KVM_CAP_HYPERV_CPUID: + case KVM_CAP_HYPERV_ENFORCE_CPUID: + case KVM_CAP_SYS_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: @@ -3342,13 +4386,47 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: +#endif + case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + case KVM_CAP_SREGS2: + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + case KVM_CAP_VCPU_ATTRIBUTES: + case KVM_CAP_SYS_ATTRIBUTES: + case KVM_CAP_VAPIC: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: r = 1; break; + case KVM_CAP_EXIT_HYPERCALL: + r = KVM_EXIT_HYPERCALL_VALID_MASK; + break; + case KVM_CAP_SET_GUEST_DEBUG2: + return KVM_GUESTDBG_VALID_MASK; +#ifdef CONFIG_KVM_XEN + case KVM_CAP_XEN_HVM: + r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | + KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | + KVM_XEN_HVM_CONFIG_SHARED_INFO | + KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | + KVM_XEN_HVM_CONFIG_EVTCHN_SEND; + if (sched_info_on()) + r |= KVM_XEN_HVM_CONFIG_RUNSTATE; + break; +#endif case KVM_CAP_SYNC_REGS: r = KVM_SYNC_X86_VALID_FIELDS; break; case KVM_CAP_ADJUST_CLOCK: - r = KVM_CLOCK_TSC_STABLE; + r = KVM_CLOCK_VALID_FLAGS; break; case KVM_CAP_X86_DISABLE_EXITS: r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | @@ -3365,19 +4443,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); - break; - case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_MAX_VCPU_ID: - r = KVM_MAX_VCPU_ID; + r = KVM_MAX_VCPU_IDS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -3389,26 +4464,99 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = boot_cpu_has(X86_FEATURE_XSAVE); break; case KVM_CAP_TSC_CONTROL: - r = kvm_has_tsc_control; + case KVM_CAP_VM_TSC_CONTROL: + r = kvm_caps.has_tsc_control; break; case KVM_CAP_X2APIC_API: r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? + kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops->enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops->nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; + break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); + break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + if (kvm_caps.has_bus_lock_exit) + r = KVM_BUS_LOCK_DETECTION_OFF | + KVM_BUS_LOCK_DETECTION_EXIT; + else + r = 0; + break; + case KVM_CAP_XSAVE2: { + u64 guest_perm = xstate_get_guest_group_perm(); + + r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false); + if (r < sizeof(struct kvm_xsave)) + r = sizeof(struct kvm_xsave); + break; + } + case KVM_CAP_PMU_CAPABILITY: + r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; + break; + case KVM_CAP_DISABLE_QUIRKS2: + r = KVM_X86_VALID_QUIRKS; + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = kvm_caps.has_notify_vmexit; break; default: break; } return r; +} +static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) +{ + void __user *uaddr = (void __user*)(unsigned long)attr->addr; + + if ((u64)(unsigned long)uaddr != attr->addr) + return ERR_PTR_USR(-EFAULT); + return uaddr; +} + +static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + + if (attr->group) + return -ENXIO; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + if (put_user(kvm_caps.supported_xcr0, uaddr)) + return -EFAULT; + return 0; + default: + return -ENXIO; + break; + } +} + +static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) +{ + if (attr->group) + return -ENXIO; + + switch (attr->attr) { + case KVM_X86_XCOMP_GUEST_SUPP: + return 0; + default: + return -ENXIO; + } } long kvm_arch_dev_ioctl(struct file *filp, @@ -3464,10 +4612,10 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; - if (copy_to_user(argp, &kvm_mce_cap_supported, - sizeof(kvm_mce_cap_supported))) + if (copy_to_user(argp, &kvm_caps.supported_mce_cap, + sizeof(kvm_caps.supported_mce_cap))) goto out; r = 0; break; @@ -3496,9 +4644,28 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp); + break; + case KVM_GET_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_get_attr(&attr); + break; + } + case KVM_HAS_DEVICE_ATTR: { + struct kvm_device_attr attr; + r = -EFAULT; + if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) + break; + r = kvm_x86_dev_has_attr(&attr); + break; } default: r = -EINVAL; + break; } out: return r; @@ -3518,14 +4685,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) + if (static_call(kvm_x86_has_wbinvd_exit)()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops->vcpu_load(vcpu, cpu); + static_call(kvm_x86_vcpu_load)(vcpu, cpu); + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3541,7 +4711,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mark_tsc_unstable("KVM discovered backwards TSC"); if (kvm_check_tsc_unstable()) { - u64 offset = kvm_compute_tsc_offset(vcpu, + u64 offset = kvm_compute_l1_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -3566,66 +4736,79 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* + * The vCPU can be marked preempted if and only if the VM-Exit was on + * an instruction boundary and will not trigger guest emulation of any + * kind (see vcpu_run). Vendor specific code controls (conservatively) + * when this is true, for example allowing the vCPU to be marked + * preempted if and only if the VM-Exit was due to a host interrupt. + */ + if (!vcpu->arch.at_instruction_boundary) { + vcpu->stat.preemption_other++; + return; + } + vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) + return; + + slots = kvm_memslots(vcpu->kvm); + + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + if (vcpu->preempted) { + if (!vcpu->arch.guest_state_protected) + vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); - /* - * Disable page faults because we're in atomic context here. - * kvm_write_guest_offset_cached() would call might_fault() - * that relies on pagefault_disable() to tell if there's a - * bug. NOTE: the write to guest memory may not go through if - * during postcopy live migration or if there's heavy guest - * paging. - */ - pagefault_disable(); - /* - * kvm_memslots() will be called by - * kvm_write_guest_offset_cached() so take the srcu lock. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - pagefault_enable(); - kvm_x86_ops->vcpu_put(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_xen_msr_enabled(vcpu->kvm)) + kvm_xen_runstate_set_preempted(vcpu); + else + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -3645,22 +4828,33 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) { + /* + * We can accept userspace's request for interrupt injection + * as long as we have a place to store the interrupt number. + * The actual injection will happen when the CPU is able to + * deliver the interrupt. + */ + if (kvm_cpu_has_extint(vcpu)) + return false; + + /* Acknowledging ExtINT does not happen if LINT0 is masked. */ return (!lapic_in_kernel(vcpu) || kvm_apic_accept_pic_intr(vcpu)); } -/* - * if userspace requested an interrupt window, check that the - * interrupt window is open. - * - * No need to exit to userspace if we already have an interrupt queued. - */ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) { - return kvm_arch_interrupt_allowed(vcpu) && - !kvm_cpu_has_interrupt(vcpu) && + /* + * Do not cause an interrupt window exit if an exception + * is pending or an event needs reinjection; userspace + * might want to inject the interrupt manually using KVM_SET_REGS + * or KVM_SET_SREGS. For that to work, we must be at an + * instruction boundary and with no events half-injected. + */ + return (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_accept_dm_intr(vcpu) && !kvm_event_needs_reinjection(vcpu) && - kvm_cpu_accept_dm_intr(vcpu); + !kvm_is_exception_pending(vcpu)); } static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, @@ -3720,24 +4914,65 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, unsigned bank_num = mcg_cap & 0xff, bank; r = -EINVAL; - if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) + if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) goto out; - if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000)) + if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) goto out; r = 0; vcpu->arch.mcg_cap = mcg_cap; /* Init IA32_MCG_CTL to all 1s */ if (mcg_cap & MCG_CTL_P) vcpu->arch.mcg_ctl = ~(u64)0; - /* Init IA32_MCi_CTL to all 1s */ - for (bank = 0; bank < bank_num; bank++) + /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ + for (bank = 0; bank < bank_num; bank++) { vcpu->arch.mce_banks[bank*4] = ~(u64)0; + if (mcg_cap & MCG_CMCI_P) + vcpu->arch.mci_ctl2_banks[bank] = 0; + } + + kvm_apic_after_set_mcg_cap(vcpu); - kvm_x86_ops->setup_mce(vcpu); + static_call(kvm_x86_setup_mce)(vcpu); out: return r; } +/* + * Validate this is an UCNA (uncorrectable no action) error by checking the + * MCG_STATUS and MCi_STATUS registers: + * - none of the bits for Machine Check Exceptions are set + * - both the VAL (valid) and UC (uncorrectable) bits are set + * MCI_STATUS_PCC - Processor Context Corrupted + * MCI_STATUS_S - Signaled as a Machine Check Exception + * MCI_STATUS_AR - Software recoverable Action Required + */ +static bool is_ucna(struct kvm_x86_mce *mce) +{ + return !mce->mcg_status && + !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && + (mce->status & MCI_STATUS_VAL) && + (mce->status & MCI_STATUS_UC); +} + +static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + + banks[1] = mce->status; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + + if (!(mcg_cap & MCG_CMCI_P) || + !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) + return 0; + + if (lapic_in_kernel(vcpu)) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI); + + return 0; +} + static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce) { @@ -3747,6 +4982,12 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) return -EINVAL; + + banks += array_index_nospec(4 * mce->bank, 4 * bank_num); + + if (is_ucna(mce)) + return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); + /* * if IA32_MCG_CTL is not all 1s, the uncorrected error * reporting is disabled @@ -3754,7 +4995,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && vcpu->arch.mcg_ctl != ~(u64)0) return 0; - banks += 4 * mce->bank; /* * if IA32_MCi_CTL is not all 1s, the uncorrected error * reporting is disabled for the bank @@ -3789,22 +5029,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + struct kvm_queued_exception *ex; + process_nmi(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); + /* - * In guest mode, payload delivery should be deferred, - * so that the L1 hypervisor can intercept #PF before - * CR2 is modified (or intercept #DB before DR6 is - * modified under nVMX). Unless the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of - * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we - * opportunistically defer the exception payload, deliver it if the - * capability hasn't been requested before processing a - * KVM_GET_VCPU_EVENTS. + * KVM's ABI only allows for one exception to be migrated. Luckily, + * the only time there can be two queued exceptions is if there's a + * non-exiting _injected_ exception, and a pending exiting exception. + * In that case, ignore the VM-Exiting exception as it's an extension + * of the injected exception. + */ + if (vcpu->arch.exception_vmexit.pending && + !vcpu->arch.exception.pending && + !vcpu->arch.exception.injected) + ex = &vcpu->arch.exception_vmexit; + else + ex = &vcpu->arch.exception; + + /* + * In guest mode, payload delivery should be deferred if the exception + * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 + * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not + * propagate the payload and so it cannot be safely deferred. Deliver + * the payload if the capability hasn't been requested. */ if (!vcpu->kvm->arch.exception_payload_enabled && - vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) - kvm_deliver_exception_payload(vcpu); + ex->pending && ex->has_payload) + kvm_deliver_exception_payload(vcpu, ex); /* * The API doesn't provide the instruction length for software @@ -3812,36 +5068,35 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, * isn't advanced, we should expect to encounter the exception * again. */ - if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { + if (kvm_exception_is_soft(ex->vector)) { events->exception.injected = 0; events->exception.pending = 0; } else { - events->exception.injected = vcpu->arch.exception.injected; - events->exception.pending = vcpu->arch.exception.pending; + events->exception.injected = ex->injected; + events->exception.pending = ex->pending; /* * For ABI compatibility, deliberately conflate * pending and injected exceptions when * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. */ if (!vcpu->kvm->arch.exception_payload_enabled) - events->exception.injected |= - vcpu->arch.exception.pending; + events->exception.injected |= ex->pending; } - events->exception.nr = vcpu->arch.exception.nr; - events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.error_code = vcpu->arch.exception.error_code; - events->exception_has_payload = vcpu->arch.exception.has_payload; - events->exception_payload = vcpu->arch.exception.payload; + events->exception.nr = ex->vector; + events->exception.has_error_code = ex->has_error_code; + events->exception.error_code = ex->error_code; + events->exception_has_payload = ex->has_payload; + events->exception_payload = ex->payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -3857,11 +5112,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SMM); if (vcpu->kvm->arch.exception_payload_enabled) events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + if (vcpu->kvm->arch.triple_fault_event) { + events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); + events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + } memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_smm_changed(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -3870,7 +5129,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM - | KVM_VCPUEVENT_VALID_PAYLOAD)) + | KVM_VCPUEVENT_VALID_PAYLOAD + | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) return -EINVAL; if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { @@ -3896,9 +5156,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; process_nmi(vcpu); + + /* + * Flag that userspace is stuffing an exception, the next KVM_RUN will + * morph the exception to a VM-Exit if appropriate. Do this only for + * pending exceptions, already-injected exceptions are not subject to + * intercpetion. Note, userspace that conflates pending and injected + * is hosed, and will incorrectly convert an injected exception into a + * pending exception, which in turn may cause a spurious VM-Exit. + */ + vcpu->arch.exception_from_userspace = events->exception.pending; + + vcpu->arch.exception_vmexit.pending = false; + vcpu->arch.exception.injected = events->exception.injected; vcpu->arch.exception.pending = events->exception.pending; - vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.vector = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; vcpu->arch.exception.has_payload = events->exception_has_payload; @@ -3908,13 +5181,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, - events->interrupt.shadow); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -3922,11 +5195,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SMM) { if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { - if (events->smi.smm) - vcpu->arch.hflags |= HF_SMM_MASK; - else - vcpu->arch.hflags &= ~HF_SMM_MASK; - kvm_smm_changed(vcpu); + kvm_x86_ops.nested_ops->leave_nested(vcpu); + kvm_smm_changed(vcpu, events->smi.smm); } vcpu->arch.smi_pending = events->smi.pending; @@ -3946,6 +5216,15 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, } } + if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + if (!vcpu->kvm->arch.triple_fault_event) + return -EINVAL; + if (events->triple_fault.pending) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + else + kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -3970,149 +5249,52 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, if (dbgregs->flags) return -EINVAL; - if (dbgregs->dr6 & ~0xffffffffull) + if (!kvm_dr6_valid(dbgregs->dr6)) return -EINVAL; - if (dbgregs->dr7 & ~0xffffffffull) + if (!kvm_dr7_valid(dbgregs->dr7)) return -EINVAL; memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); return 0; } -#define XSTATE_COMPACTION_ENABLED (1ULL << 63) - -static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = xsave->header.xfeatures; - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(dest, xsave, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV */ - xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; - *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; - - /* - * Copy each region from the possibly compacted offset to the - * non-compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *src = get_xsave_addr(xsave, xfeature_nr); - - if (src) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(dest + offset, &vcpu->arch.pkru, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest + offset, src, size); - - } + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + guest_xsave->region, + sizeof(guest_xsave->region), + vcpu->arch.pkru); } -static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) +static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, + u8 *state, unsigned int size) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; - u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); - u64 valid; - - /* - * Copy legacy XSAVE area, to avoid complications with CPUID - * leaves 0 and 1 in the loop below. - */ - memcpy(xsave, src, XSAVE_HDR_OFFSET); - - /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->header.xfeatures = xstate_bv; - if (boot_cpu_has(X86_FEATURE_XSAVES)) - xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Copy each region from the non-compacted offset to the - * possibly compacted offset. - */ - valid = xstate_bv & ~XFEATURE_MASK_FPSSE; - while (valid) { - u64 xfeature_mask = valid & -valid; - int xfeature_nr = fls64(xfeature_mask) - 1; - void *dest = get_xsave_addr(xsave, xfeature_nr); - - if (dest) { - u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, - &size, &offset, &ecx, &edx); - if (xfeature_nr == XFEATURE_PKRU) - memcpy(&vcpu->arch.pkru, src + offset, - sizeof(vcpu->arch.pkru)); - else - memcpy(dest, src + offset, size); - } + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return; - valid -= xfeature_mask; - } + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, + state, size, vcpu->arch.pkru); } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - memset(guest_xsave, 0, sizeof(struct kvm_xsave)); - fill_xsave((u8 *) guest_xsave->region, vcpu); - } else { - memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu->state.fxsave, - sizeof(struct fxregs_state)); - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = - XFEATURE_MASK_FPSSE; - } -} - -#define XSAVE_MXCSR_OFFSET 24 - static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, struct kvm_xsave *guest_xsave) { - u64 xstate_bv = - *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; - u32 mxcsr = *(u32 *)&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)]; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; - if (boot_cpu_has(X86_FEATURE_XSAVE)) { - /* - * Here we allow setting states that are not present in - * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility - * with old userspace. - */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - load_xsave(vcpu, (u8 *)guest_xsave->region); - } else { - if (xstate_bv & ~XFEATURE_MASK_FPSSE || - mxcsr & ~mxcsr_feature_mask) - return -EINVAL; - memcpy(&vcpu->arch.guest_fpu->state.fxsave, - guest_xsave->region, sizeof(struct fxregs_state)); - } - return 0; + return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu, + guest_xsave->region, + kvm_caps.supported_xcr0, + &vcpu->arch.pkru); } static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, @@ -4160,13 +5342,122 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, */ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.pv_time_enabled) + if (!vcpu->arch.pv_time.active) return -EINVAL; vcpu->arch.pvclock_set_guest_stopped_request = true; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return 0; } +static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + int r; + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + int r; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: + r = -EFAULT; + if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) + break; + r = 0; + break; + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr) +{ + u64 __user *uaddr = kvm_get_attr_addr(attr); + struct kvm *kvm = vcpu->kvm; + int r; + + if (IS_ERR(uaddr)) + return PTR_ERR(uaddr); + + switch (attr->attr) { + case KVM_VCPU_TSC_OFFSET: { + u64 offset, tsc, ns; + unsigned long flags; + bool matched; + + r = -EFAULT; + if (get_user(offset, uaddr)) + break; + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + + matched = (vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && + kvm->arch.last_tsc_offset == offset); + + tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset; + ns = get_kvmclock_base_ns(); + + __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + r = 0; + break; + } + default: + r = -ENXIO; + } + + return r; +} + +static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, + unsigned int ioctl, + void __user *argp) +{ + struct kvm_device_attr attr; + int r; + + if (copy_from_user(&attr, argp, sizeof(attr))) + return -EFAULT; + + if (attr.group != KVM_VCPU_TSC_CTRL) + return -ENXIO; + + switch (ioctl) { + case KVM_HAS_DEVICE_ATTR: + r = kvm_arch_tsc_has_attr(vcpu, &attr); + break; + case KVM_GET_DEVICE_ATTR: + r = kvm_arch_tsc_get_attr(vcpu, &attr); + break; + case KVM_SET_DEVICE_ATTR: + r = kvm_arch_tsc_set_attr(vcpu, &attr); + break; + } + + return r; +} + static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { @@ -4181,7 +5472,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; - /* fall through */ + fallthrough; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) @@ -4189,9 +5480,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops->nested_enable_evmcs) + if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; - r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4200,11 +5491,20 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops->enable_direct_tlbflush) + if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops->enable_direct_tlbflush(vcpu); + return static_call(kvm_x86_enable_direct_tlbflush)(vcpu); + + case KVM_CAP_HYPERV_ENFORCE_CPUID: + return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]); + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + if (vcpu->arch.pv_cpuid.enforce) + kvm_update_pv_runtime(vcpu); + + return 0; default: return -EINVAL; } @@ -4217,6 +5517,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; union { + struct kvm_sregs2 *sregs2; struct kvm_lapic_state *lapic; struct kvm_xsave *xsave; struct kvm_xcrs *xcrs; @@ -4419,6 +5720,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_XSAVE: { + r = -EINVAL; + if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) + break; + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) @@ -4433,7 +5738,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_SET_XSAVE: { - u.xsave = memdup_user(argp, sizeof(*u.xsave)); + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = memdup_user(argp, size); if (IS_ERR(u.xsave)) { r = PTR_ERR(u.xsave); goto out_nofree; @@ -4442,6 +5749,25 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; } + + case KVM_GET_XSAVE2: { + int size = vcpu->arch.guest_fpu.uabi_size; + + u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); + r = -ENOMEM; + if (!u.xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size); + + r = -EFAULT; + if (copy_to_user(argp, u.xsave, size)) + break; + + r = 0; + break; + } + case KVM_GET_XCRS: { u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; @@ -4473,7 +5799,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; user_tsc_khz = (u32)arg; - if (user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -4506,7 +5833,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops->get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4514,8 +5841,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@ -4536,7 +5863,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops->set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@ -4549,7 +5876,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE - | KVM_STATE_NESTED_EVMCS)) + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING + | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending implies guest_mode. */ @@ -4558,29 +5886,62 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } - case KVM_GET_SUPPORTED_HV_CPUID: { - struct kvm_cpuid2 __user *cpuid_arg = argp; - struct kvm_cpuid2 cpuid; + case KVM_GET_SUPPORTED_HV_CPUID: + r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp); + break; +#ifdef CONFIG_KVM_XEN + case KVM_XEN_VCPU_GET_ATTR: { + struct kvm_xen_vcpu_attr xva; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + if (copy_from_user(&xva, argp, sizeof(xva))) goto out; + r = kvm_xen_vcpu_get_attr(vcpu, &xva); + if (!r && copy_to_user(argp, &xva, sizeof(xva))) + r = -EFAULT; + break; + } + case KVM_XEN_VCPU_SET_ATTR: { + struct kvm_xen_vcpu_attr xva; - r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, - cpuid_arg->entries); - if (r) + r = -EFAULT; + if (copy_from_user(&xva, argp, sizeof(xva))) goto out; - + r = kvm_xen_vcpu_set_attr(vcpu, &xva); + break; + } +#endif + case KVM_GET_SREGS2: { + u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL); + r = -ENOMEM; + if (!u.sregs2) + goto out; + __get_sregs2(vcpu, u.sregs2); r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2))) goto out; r = 0; break; } + case KVM_SET_SREGS2: { + u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); + if (IS_ERR(u.sregs2)) { + r = PTR_ERR(u.sregs2); + u.sregs2 = NULL; + goto out; + } + r = __set_sregs2(vcpu, u.sregs2); + break; + } + case KVM_HAS_DEVICE_ATTR: + case KVM_GET_DEVICE_ATTR: + case KVM_SET_DEVICE_ATTR: + r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); + break; default: r = -EINVAL; } @@ -4602,14 +5963,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); + ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); + return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4761,77 +6122,20 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). + * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called + * before reporting dirty_bitmap to userspace. KVM flushes the buffers + * on all VM-Exits, thus we only need to kick running vCPUs to force a + * VM-Exit. */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); + struct kvm_vcpu *vcpu; + unsigned long i; - mutex_unlock(&kvm->slots_lock); - return r; + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_vcpu_kick(vcpu); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -4855,6 +6159,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; switch (cap->cap) { + case KVM_CAP_DISABLE_QUIRKS2: + r = -EINVAL; + if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) + break; + fallthrough; case KVM_CAP_DISABLE_QUIRKS: kvm->arch.disabled_quirks = cap->args[0]; r = 0; @@ -4876,6 +6185,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); @@ -4917,6 +6227,150 @@ split_irqchip_unlock: kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; + case KVM_CAP_X86_TRIPLE_FAULT_EVENT: + kvm->arch.triple_fault_event = cap->args[0]; + r = 0; + break; + case KVM_CAP_X86_USER_SPACE_MSR: + r = -EINVAL; + if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER)) + break; + kvm->arch.user_space_msr_mask = cap->args[0]; + r = 0; + break; + case KVM_CAP_X86_BUS_LOCK_EXIT: + r = -EINVAL; + if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) + break; + + if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && + (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) + break; + + if (kvm_caps.has_bus_lock_exit && + cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) + kvm->arch.bus_lock_detection_enabled = true; + r = 0; + break; +#ifdef CONFIG_X86_SGX_KVM + case KVM_CAP_SGX_ATTRIBUTE: { + unsigned long allowed_attributes = 0; + + r = sgx_set_attribute(&allowed_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. */ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif + case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + r = -EINVAL; + if (!kvm_x86_ops.vm_copy_enc_context_from) + break; + + r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); + break; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (!kvm_x86_ops.vm_move_enc_context_from) + break; + + r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); + break; + case KVM_CAP_EXIT_HYPERCALL: + if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { + r = -EINVAL; + break; + } + kvm->arch.hypercall_exit_enabled = cap->args[0]; + r = 0; + break; + case KVM_CAP_EXIT_ON_EMULATION_FAILURE: + r = -EINVAL; + if (cap->args[0] & ~1) + break; + kvm->arch.exit_on_emulation_error = cap->args[0]; + r = 0; + break; + case KVM_CAP_PMU_CAPABILITY: + r = -EINVAL; + if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_MAX_VCPU_ID: + r = -EINVAL; + if (cap->args[0] > KVM_MAX_VCPU_IDS) + break; + + mutex_lock(&kvm->lock); + if (kvm->arch.max_vcpu_ids == cap->args[0]) { + r = 0; + } else if (!kvm->arch.max_vcpu_ids) { + kvm->arch.max_vcpu_ids = cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_X86_NOTIFY_VMEXIT: + r = -EINVAL; + if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) + break; + if (!kvm_caps.has_notify_vmexit) + break; + if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) + break; + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.notify_window = cap->args[0] >> 32; + kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; + r = 0; + } + mutex_unlock(&kvm->lock); + break; + case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: + r = -EINVAL; + + /* + * Since the risk of disabling NX hugepages is a guest crashing + * the system, ensure the userspace process has permission to + * reboot the system. + * + * Note that unlike the reboot() syscall, the process must have + * this capability in the root namespace because exposing + * /dev/kvm into a container does not limit the scope of the + * iTLB multihit bug to that container. In other words, + * this must use capable(), not ns_capable(). + */ + if (!capable(CAP_SYS_BOOT)) { + r = -EPERM; + break; + } + + if (cap->args[0]) + break; + + mutex_lock(&kvm->lock); + if (!kvm->created_vcpus) { + kvm->arch.disable_nx_huge_pages = true; + r = 0; + } + mutex_unlock(&kvm->lock); + break; default: r = -EINVAL; break; @@ -4924,6 +6378,261 @@ split_irqchip_unlock: return r; } +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) +{ + u32 i; + + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); + + kfree(msr_filter); +} + +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) +{ + unsigned long *bitmap = NULL; + size_t bitmap_size; + + if (!user_range->nmsrs) + return 0; + + if (user_range->flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) + return -EINVAL; + + if (!user_range->flags) + return -EINVAL; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + msr_filter->count++; + return 0; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) +{ + struct kvm_x86_msr_filter *new_filter, *old_filter; + bool default_allow; + bool empty = true; + int r = 0; + u32 i; + + if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; + + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } + } + + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return 0; +} + +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif + +#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER +static int kvm_arch_suspend_notifier(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + unsigned long i; + int ret = 0; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!vcpu->arch.pv_time.active) + continue; + + ret = kvm_set_guest_paused(vcpu); + if (ret) { + kvm_err("Failed to pause guest VCPU%d: %d\n", + vcpu->vcpu_id, ret); + break; + } + } + mutex_unlock(&kvm->lock); + + return ret ? NOTIFY_BAD : NOTIFY_DONE; +} + +int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + return kvm_arch_suspend_notifier(kvm); + } + + return NOTIFY_DONE; +} +#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ + +static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_clock_data data = { 0 }; + + get_kvmclock(kvm, &data); + if (copy_to_user(argp, &data, sizeof(data))) + return -EFAULT; + + return 0; +} + +static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) +{ + struct kvm_arch *ka = &kvm->arch; + struct kvm_clock_data data; + u64 now_raw_ns; + + if (copy_from_user(&data, argp, sizeof(data))) + return -EFAULT; + + /* + * Only KVM_CLOCK_REALTIME is used, but allow passing the + * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. + */ + if (data.flags & ~KVM_CLOCK_VALID_FLAGS) + return -EINVAL; + + kvm_hv_request_tsc_page_update(kvm); + kvm_start_pvclock_update(kvm); + pvclock_update_vm_gtod_copy(kvm); + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'data.clock' is very small. + */ + if (data.flags & KVM_CLOCK_REALTIME) { + u64 now_real_ns = ktime_get_real_ns(); + + /* + * Avoid stepping the kvmclock backwards. + */ + if (now_real_ns > data.realtime) + data.clock += now_real_ns - data.realtime; + } + + if (ka->use_master_clock) + now_raw_ns = ka->master_kernel_ns; + else + now_raw_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = data.clock - now_raw_ns; + kvm_end_pvclock_update(kvm); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4996,6 +6705,7 @@ set_identity_unlock: /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ smp_wmb(); kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; + kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT); create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -5082,10 +6792,13 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit_out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); +set_pit_out: + mutex_unlock(&kvm->lock); break; } case KVM_GET_PIT2: { @@ -5105,10 +6818,13 @@ set_identity_unlock: r = -EFAULT; if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) goto out; + mutex_lock(&kvm->lock); r = -ENXIO; if (!kvm->arch.vpit) - goto out; + goto set_pit2_out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); +set_pit2_out: + mutex_unlock(&kvm->lock); break; } case KVM_REINJECT_CONTROL: { @@ -5131,61 +6847,79 @@ set_identity_unlock: kvm->arch.bsp_vcpu_id = arg; mutex_unlock(&kvm->lock); break; +#ifdef CONFIG_KVM_XEN case KVM_XEN_HVM_CONFIG: { struct kvm_xen_hvm_config xhc; r = -EFAULT; if (copy_from_user(&xhc, argp, sizeof(xhc))) goto out; - r = -EINVAL; - if (xhc.flags) - goto out; - memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc)); - r = 0; + r = kvm_xen_hvm_config(kvm, &xhc); break; } - case KVM_SET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; + case KVM_XEN_HVM_GET_ATTR: { + struct kvm_xen_hvm_attr xha; r = -EFAULT; - if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + if (copy_from_user(&xha, argp, sizeof(xha))) goto out; + r = kvm_xen_hvm_get_attr(kvm, &xha); + if (!r && copy_to_user(argp, &xha, sizeof(xha))) + r = -EFAULT; + break; + } + case KVM_XEN_HVM_SET_ATTR: { + struct kvm_xen_hvm_attr xha; - r = -EINVAL; - if (user_ns.flags) + r = -EFAULT; + if (copy_from_user(&xha, argp, sizeof(xha))) goto out; + r = kvm_xen_hvm_set_attr(kvm, &xha); + break; + } + case KVM_XEN_HVM_EVTCHN_SEND: { + struct kvm_irq_routing_xen_evtchn uxe; - r = 0; - /* - * TODO: userspace has to take care of races with VCPU_RUN, so - * kvm_gen_update_masterclock() can be cut down to locked - * pvclock_update_vm_gtod_copy(). - */ - kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); + r = -EFAULT; + if (copy_from_user(&uxe, argp, sizeof(uxe))) + goto out; + r = kvm_xen_hvm_evtchn_send(kvm, &uxe); break; } - case KVM_GET_CLOCK: { - struct kvm_clock_data user_ns; - u64 now_ns; +#endif + case KVM_SET_CLOCK: + r = kvm_vm_ioctl_set_clock(kvm, argp); + break; + case KVM_GET_CLOCK: + r = kvm_vm_ioctl_get_clock(kvm, argp); + break; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; - now_ns = get_kvmclock_ns(kvm); - user_ns.clock = now_ns; - user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - memset(&user_ns.pad, 0, sizeof(user_ns.pad)); + r = -EINVAL; + user_tsc_khz = (u32)arg; - r = -EFAULT; - if (copy_to_user(argp, &user_ns, sizeof(user_ns))) + if (kvm_caps.has_tsc_control && + user_tsc_khz >= kvm_caps.max_guest_tsc_khz) goto out; + + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); r = 0; - break; + + goto out; + } + case KVM_GET_TSC_KHZ: { + r = READ_ONCE(kvm->arch.default_tsc_khz); + goto out; } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops->mem_enc_op) - r = kvm_x86_ops->mem_enc_op(kvm, argp); + if (!kvm_x86_ops.mem_enc_ioctl) + goto out; + + r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5196,8 +6930,10 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_reg_region) - r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_register_region) + goto out; + + r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5208,8 +6944,10 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_unreg_region) - r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + if (!kvm_x86_ops.mem_enc_unregister_region) + goto out; + + r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5224,6 +6962,16 @@ set_identity_unlock: case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } default: r = -ENOTTY; } @@ -5233,15 +6981,12 @@ out: static void kvm_init_msr_list(void) { - struct x86_pmu_capability x86_pmu; u32 dummy[2]; unsigned i; - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, "Please update the fixed PMCs in msrs_to_saved_all[]"); - perf_get_x86_pmu_capability(&x86_pmu); - num_msrs_to_save = 0; num_emulated_msrs = 0; num_msr_based_features = 0; @@ -5260,42 +7005,52 @@ static void kvm_init_msr_list(void) continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && + !kvm_cpu_cap_has(X86_FEATURE_RDPID)) + continue; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) continue; break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: - if (!kvm_x86_ops->pt_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; - case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { - if (!kvm_x86_ops->pt_supported() || + case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; - } + break; + case MSR_IA32_XFD: + case MSR_IA32_XFD_ERR: + if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) + continue; + break; default: break; } @@ -5304,7 +7059,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) + if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5367,25 +7122,26 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->set_segment(vcpu, var, seg); + static_call(kvm_x86_set_segment)(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->get_segment(vcpu, var, seg); + static_call(kvm_x86_get_segment)(vcpu, var, seg); } -gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, +gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.mmu; gpa_t t_gpa; BUG_ON(!mmu_is_nested(vcpu)); /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); return t_gpa; } @@ -5393,48 +7149,58 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception); + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + + return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, offset, toread); @@ -5457,14 +7223,15 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; /* Inline kvm_read_guest_virt_helper for speed. */ - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, - exception); - if (unlikely(gpa == UNMAPPED_GVA)) + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == INVALID_GPA)) return X86EMUL_PROPAGATE_FAULT; offset = addr & (PAGE_SIZE-1); @@ -5482,7 +7249,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5501,9 +7268,11 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception, bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = 0; + u64 access = 0; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -5519,21 +7288,20 @@ static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, } static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 access, + struct kvm_vcpu *vcpu, u64 access, struct x86_exception *exception) { + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, - access, - exception); + gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return X86EMUL_PROPAGATE_FAULT; ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { @@ -5554,9 +7322,11 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v bool system) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = PFERR_WRITE_MASK; + u64 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (system) + access |= PFERR_IMPLICIT_ACCESS; + else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -5569,29 +7339,35 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; - /* - * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED - * is returned, but our callers are not ready for that and they blindly - * call kvm_inject_page_fault. Ensure that they at least do not leak - * uninitialized kernel stack memory into cr2 and error code. - */ - memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, + void *insn, int insn_len) +{ + return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type, + insn, insn_len); +} + int handle_ud(struct kvm_vcpu *vcpu) { static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; + int fep_flags = READ_ONCE(force_emulation_prefix); int emul_type = EMULTYPE_TRAP_UD; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; - if (force_emulation_prefix && + if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0))) + return 1; + + if (fep_flags && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { + if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF) + kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF); kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); emul_type = EMULTYPE_TRAP_UD_FORCED; } @@ -5619,7 +7395,8 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -5627,18 +7404,18 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); + *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); - if (*gpa == UNMAPPED_GVA) + if (*gpa == INVALID_GPA) return -1; return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); @@ -5738,7 +7515,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@ -5747,10 +7524,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, * operation using rep will only have the initial GPA from the NPF * occurred. */ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5854,15 +7630,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, exception, &write_emultor); } -#define CMPXCHG_TYPE(t, ptr, old, new) \ - (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) - -#ifdef CONFIG_X86_64 -# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) -#else -# define CMPXCHG64(ptr, old, new) \ - (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) -#endif +#define emulator_try_cmpxchg_user(t, ptr, old, new) \ + (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t)) static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned long addr, @@ -5871,11 +7640,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, unsigned int bytes, struct x86_exception *exception) { - struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + u64 page_line_mask; + unsigned long hva; gpa_t gpa; - char *kaddr; - bool exchanged; + int r; /* guests cmpxchg8b have to be emulated atomically */ if (bytes > 8 || (bytes & (bytes - 1))) @@ -5883,38 +7652,48 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (gpa == UNMAPPED_GVA || + if (gpa == INVALID_GPA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + /* + * Emulate the atomic as a straight write to avoid #AC if SLD is + * enabled in the host and the access splits a cache line. + */ + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + page_line_mask = ~(cache_line_size() - 1); + else + page_line_mask = PAGE_MASK; + + if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; - if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) + hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa)); + if (kvm_is_error_hva(hva)) goto emul_write; - kaddr = map.hva + offset_in_page(gpa); + hva += offset_in_page(gpa); switch (bytes) { case 1: - exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + r = emulator_try_cmpxchg_user(u8, hva, old, new); break; case 2: - exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + r = emulator_try_cmpxchg_user(u16, hva, old, new); break; case 4: - exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + r = emulator_try_cmpxchg_user(u32, hva, old, new); break; case 8: - exchanged = CMPXCHG64(kaddr, old, new); + r = emulator_try_cmpxchg_user(u64, hva, old, new); break; default: BUG(); } - kvm_vcpu_unmap(vcpu, &map, true); - - if (!exchanged) + if (r < 0) + return X86EMUL_UNHANDLEABLE; + if (r) return X86EMUL_CMPXCHG_FAILED; kvm_page_track_write(vcpu, gpa, new, bytes); @@ -5927,38 +7706,47 @@ emul_write: return emulator_write_emulated(ctxt, addr, new, bytes, exception); } -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *data, + unsigned int count, bool in) { - int r = 0, i; + unsigned i; + int r; - for (i = 0; i < vcpu->arch.pio.count; i++) { - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + WARN_ON_ONCE(vcpu->arch.pio.count); + for (i = 0; i < count; i++) { + if (in) + r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data); else - r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - if (r) + r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data); + + if (r) { + if (i == 0) + goto userspace_io; + + /* + * Userspace must have unregistered the device while PIO + * was running. Drop writes / read as 0. + */ + if (in) + memset(data, 0, size * (count - i)); break; - pd += vcpu->arch.pio.size; + } + + data += size; } - return r; -} + return 1; -static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, - unsigned short port, void *val, - unsigned int count, bool in) -{ +userspace_io: vcpu->arch.pio.port = port; vcpu->arch.pio.in = in; - vcpu->arch.pio.count = count; + vcpu->arch.pio.count = count; vcpu->arch.pio.size = size; - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - vcpu->arch.pio.count = 0; - return 1; - } + if (in) + memset(vcpu->arch.pio_data, 0, size * count); + else + memcpy(vcpu->arch.pio_data, data, size * count); vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -5966,48 +7754,66 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; vcpu->run->io.count = count; vcpu->run->io.port = port; - return 0; } +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) +{ + int r = emulator_pio_in_out(vcpu, size, port, val, count, true); + if (r) + trace_kvm_pio(KVM_PIO_IN, port, size, count, val); + + return r; +} + +static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) +{ + int size = vcpu->arch.pio.size; + unsigned int count = vcpu->arch.pio.count; + memcpy(val, vcpu->arch.pio_data, size * count); + trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data); + vcpu->arch.pio.count = 0; +} + static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, void *val, unsigned int count) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - int ret; - - if (vcpu->arch.pio.count) - goto data_avail; - - memset(vcpu->arch.pio_data, 0, size * count); - - ret = emulator_pio_in_out(vcpu, size, port, val, count, true); - if (ret) { -data_avail: - memcpy(val, vcpu->arch.pio_data, size * count); - trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data); - vcpu->arch.pio.count = 0; + if (vcpu->arch.pio.count) { + /* + * Complete a previous iteration that required userspace I/O. + * Note, @count isn't guaranteed to match pio.count as userspace + * can modify ECX before rerunning the vCPU. Ignore any such + * shenanigans as KVM doesn't support modifying the rep count, + * and the emulator ensures @count doesn't overflow the buffer. + */ + complete_emulator_pio_in(vcpu, val); return 1; } - return 0; + return emulator_pio_in(vcpu, size, port, val, count); +} + +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ + trace_kvm_pio(KVM_PIO_OUT, port, size, count, val); + return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, int size, unsigned short port, const void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - - memcpy(vcpu->arch.pio_data, val, size * count); - trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); - return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); } static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + return static_call(kvm_x86_get_segment_base)(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6020,11 +7826,11 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops->has_wbinvd_exit()) { + if (static_call(kvm_x86_has_wbinvd_exit)()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); - smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, + on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); put_cpu(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); @@ -6047,17 +7853,17 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); } -static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, - unsigned long *dest) +static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, + unsigned long *dest) { - return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); + kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); + return kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -6125,27 +7931,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); + return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); + static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6220,6 +8026,52 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, return; } +static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_get_msr_with_filter(vcpu, msr_index, pdata); + if (r < 0) + return X86EMUL_UNHANDLEABLE; + + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_read_ex(msr_index); + return X86EMUL_PROPAGATE_FAULT; + } + + trace_kvm_msr_read(msr_index, *pdata); + return X86EMUL_CONTINUE; +} + +static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; + + r = kvm_set_msr_with_filter(vcpu, msr_index, data); + if (r < 0) + return X86EMUL_UNHANDLEABLE; + + if (r) { + if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_msr_access, r)) + return X86EMUL_IO_NEEDED; + + trace_kvm_msr_write_ex(msr_index, data); + return X86EMUL_PROPAGATE_FAULT; + } + + trace_kvm_msr_write(msr_index, data); + return X86EMUL_CONTINUE; +} + static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { @@ -6249,7 +8101,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) + return 0; + return -EINVAL; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -6267,13 +8121,15 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) @@ -6291,19 +8147,24 @@ static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); } +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); +} + static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) { - return kvm_register_read(emul_to_vcpu(ctxt), reg); + return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); } static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) { - kvm_register_write(emul_to_vcpu(ctxt), reg, val); + kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val); } static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); + static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6311,20 +8172,22 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) return emul_to_vcpu(ctxt)->arch.hflags; } -static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +static void emulator_exiting_smm(struct x86_emulate_ctxt *ctxt) { - emul_to_vcpu(ctxt)->arch.hflags = emul_flags; + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + kvm_smm_changed(vcpu, false); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, +static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return static_call(kvm_x86_leave_smm)(emul_to_vcpu(ctxt), smstate); } -static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) +static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) { - kvm_smm_changed(emul_to_vcpu(ctxt)); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); } static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) @@ -6332,7 +8195,16 @@ static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } +static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) +{ + struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; + + if (!kvm->vm_bugged) + kvm_vm_bugged(kvm); +} + static const struct x86_emulate_ops emulate_ops = { + .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, .read_std = emulator_read_std, @@ -6359,6 +8231,8 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .get_smbase = emulator_get_smbase, .set_smbase = emulator_set_smbase, + .set_msr_with_filter = emulator_set_msr_with_filter, + .get_msr_with_filter = emulator_get_msr_with_filter, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -6371,17 +8245,18 @@ static const struct x86_emulate_ops emulate_ops = { .guest_has_long_mode = emulator_guest_has_long_mode, .guest_has_movbe = emulator_guest_has_movbe, .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, - .set_hflags = emulator_set_hflags, - .pre_leave_smm = emulator_pre_leave_smm, - .post_leave_smm = emulator_post_leave_smm, + .exiting_smm = emulator_exiting_smm, + .leave_smm = emulator_leave_smm, + .triple_fault = emulator_triple_fault, .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -6392,33 +8267,50 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } } -static bool inject_emulated_exception(struct kvm_vcpu *vcpu) +static void inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; - if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; - if (ctxt->exception.error_code_valid) + if (ctxt->exception.vector == PF_VECTOR) + kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); + else if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, ctxt->exception.error_code); else kvm_queue_exception(vcpu, ctxt->exception.vector); - return false; +} + +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; } static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@ -6432,13 +8324,18 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -6458,8 +8355,82 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); +static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata, u8 *insn_bytes, u8 insn_size) +{ + struct kvm_run *run = vcpu->run; + u64 info[5]; + u8 info_start; + + /* + * Zero the whole array used to retrieve the exit info, as casting to + * u32 for select entries will leave some chunks uninitialized. + */ + memset(&info, 0, sizeof(info)); + + static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], + &info[2], (u32 *)&info[3], + (u32 *)&info[4]); + + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; + + /* + * There's currently space for 13 entries, but 5 are used for the exit + * reason and info. Restrict to 4 to reduce the maintenance burden + * when expanding kvm_run.emulation_failure in the future. + */ + if (WARN_ON_ONCE(ndata > 4)) + ndata = 4; + + /* Always include the flags as a 'data' entry. */ + info_start = 1; + run->emulation_failure.flags = 0; + + if (insn_size) { + BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + + sizeof(run->emulation_failure.insn_bytes) != 16)); + info_start += 2; + run->emulation_failure.flags |= + KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; + run->emulation_failure.insn_size = insn_size; + memset(run->emulation_failure.insn_bytes, 0x90, + sizeof(run->emulation_failure.insn_bytes)); + memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); + } + + memcpy(&run->internal.data[info_start], info, sizeof(info)); + memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, + ndata * sizeof(data[0])); + + run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; +} + +static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + + prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data, + ctxt->fetch.end - ctxt->fetch.data); +} + +void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, + u8 ndata) +{ + prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0); +} +EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); + +void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) +{ + __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); +} +EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); + static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { + struct kvm *kvm = vcpu->kvm; + ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); @@ -6468,19 +8439,16 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) return 1; } - if (emulation_type & EMULTYPE_SKIP) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (kvm->arch.exit_on_emulation_error || + (emulation_type & EMULTYPE_SKIP)) { + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; + if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { + prepare_emulation_ctxt_failure_exit(vcpu); return 0; } @@ -6494,13 +8462,14 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; - if (!vcpu->arch.mmu->direct_map) { + if (!vcpu->arch.mmu->root_role.direct) { /* * Write permission should be allowed since only * write access need to be emulated. @@ -6511,7 +8480,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * If the mapping is invalid in guest, let cpu retry * it to generate fault. */ - if (gpa == UNMAPPED_GVA) + if (gpa == INVALID_GPA) return true; } @@ -6533,12 +8502,12 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_release_pfn_clean(pfn); /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu->direct_map) { + if (vcpu->arch.mmu->root_role.direct) { unsigned int indirect_shadow_pages; - spin_lock(&vcpu->kvm->mmu_lock); + write_lock(&vcpu->kvm->mmu_lock); indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; - spin_unlock(&vcpu->kvm->mmu_lock); + write_unlock(&vcpu->kvm->mmu_lock); if (indirect_shadow_pages) kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6585,10 +8554,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6600,7 +8570,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, vcpu->arch.last_retry_eip = ctxt->eip; vcpu->arch.last_retry_addr = cr2_or_gpa; - if (!vcpu->arch.mmu->direct_map) + if (!vcpu->arch.mmu->root_role.direct) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6611,14 +8581,24 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); -static void kvm_smm_changed(struct kvm_vcpu *vcpu) +static void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) { - if (!(vcpu->arch.hflags & HF_SMM_MASK)) { - /* This is a good place to trace that we are exiting SMM. */ - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm); + + if (entering_smm) { + vcpu->arch.hflags |= HF_SMM_MASK; + } else { + vcpu->arch.hflags &= ~(HF_SMM_MASK | HF_SMM_INSIDE_NMI_MASK); /* Process a latched INIT or SMI, if any. */ kvm_make_request(KVM_REQ_EVENT, vcpu); + + /* + * Even if KVM_SET_SREGS2 loaded PDPTRs out of band, + * on SMM exit we still need to reload them from + * guest memory + */ + vcpu->arch.pdptrs_from_userspace = false; } kvm_mmu_reset_context(vcpu); @@ -6644,8 +8624,8 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; return 0; @@ -6656,13 +8636,15 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); int r; - r = kvm_x86_ops->skip_emulated_instruction(vcpu); + r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); if (unlikely(!r)) return 0; + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + /* * rflags is the old, "raw" value of the flags. The new value has * not been saved yet. @@ -6677,8 +8659,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) { + u32 shadow; + + if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) + return true; + + /* + * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, + * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first + * to avoid the relatively expensive CPUID lookup. + */ + shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); + return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && + guest_cpuid_is_intel(vcpu); +} + +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, + int emulation_type, int *r) +{ + WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); + + /* + * Do not check for code breakpoints if hardware has already done the + * checks, as inferred from the emulation type. On NO_DECODE and SKIP, + * the instruction has passed all exception checks, and all intercepted + * exceptions that trigger emulation have lower priority than code + * breakpoints, i.e. the fact that the intercepted exception occurred + * means any code breakpoints have already been serviced. + * + * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as + * hardware has checked the RIP of the magic prefix, but not the RIP of + * the instruction being emulated. The intent of forced emulation is + * to behave as if KVM intercepted the instruction without an exception + * and without a prefix. + */ + if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP | + EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) + return false; + if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { struct kvm_run *kvm_run = vcpu->run; @@ -6688,7 +8708,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; + kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -6698,16 +8718,14 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && - !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { + !kvm_is_code_breakpoint_inhibited(vcpu)) { unsigned long eip = kvm_get_linear_rip(vcpu); u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); *r = 1; return true; } @@ -6747,13 +8765,43 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) return false; } +/* + * Decode an instruction for emulation. The caller is responsible for handling + * code breakpoints. Note, manually detecting code breakpoints is unnecessary + * (and wrong) when emulating on an intercepted fault-like exception[*], as + * code breakpoints have higher priority and thus have already been done by + * hardware. + * + * [*] Except #MC, which is higher priority, but KVM should never emulate in + * response to a machine check. + */ +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + int r; + + init_emulate_ctxt(vcpu); + + r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); + + trace_kvm_emulate_insn_start(vcpu); + ++vcpu->stat.insn_emulation; + + return r; +} +EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6761,33 +8809,22 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. */ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; - kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); + kvm_clear_exception_queue(vcpu); /* - * We will reenter on the same instruction since - * we do not set complete_userspace_io. This does not - * handle watchpoints yet, those would be handled in - * the emulate_ops. + * Return immediately if RIP hits a code breakpoint, such #DBs + * are fault-like and are higher priority than any faults on + * the code fetch itself. */ - if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) + if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); - - trace_kvm_emulate_insn_start(vcpu); - ++vcpu->stat.insn_emulation; + r = x86_decode_emulated_instruction(vcpu, emulation_type, + insn, insn_len); if (r != EMULATION_OK) { if ((emulation_type & EMULTYPE_TRAP_UD) || (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { @@ -6819,12 +8856,23 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } /* - * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks - * for kvm_skip_emulated_instruction(). The caller is responsible for - * updating interruptibility state and injecting single-step #DBs. + * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for + * use *only* by vendor callbacks for kvm_skip_emulated_instruction(). + * The caller is responsible for updating interruptibility state and + * injecting single-step #DBs. */ if (emulation_type & EMULTYPE_SKIP) { - kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->mode != X86EMUL_MODE_PROT64) + ctxt->eip = (u32)ctxt->_eip; + else + ctxt->eip = ctxt->_eip; + + if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) { + r = 1; + goto writeback; + } + + kvm_rip_write(vcpu, ctxt->eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return 1; @@ -6841,8 +8889,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. */ + if (vcpu->arch.mmu->root_role.direct) { + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. */ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); @@ -6859,8 +8918,7 @@ restart: if (ctxt->have_exception) { r = 1; - if (inject_emulated_exception(vcpu)) - return r; + inject_emulated_exception(vcpu); } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) { /* FIXME: return into emulator if single-stepping. */ @@ -6877,22 +8935,34 @@ restart: writeback = false; r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; + } else if (vcpu->arch.complete_userspace_io) { + writeback = false; + r = 0; } else if (r == EMULATION_RESTART) goto restart; else r = 1; +writeback: if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + + /* + * Note, EXCPT_DB is assumed to be fault-like as the emulator + * only supports code breakpoints and general detect #DB, both + * of which are fault-like. + */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS); + if (ctxt->is_branch) + kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); kvm_rip_write(vcpu, ctxt->eip); - if (r && ctxt->tf) + if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops->update_emulated_instruction) - kvm_x86_ops->update_emulated_instruction(vcpu); + static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -6943,8 +9013,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_rax_read(vcpu); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + int ret = emulator_pio_out(vcpu, size, port, &val, 1); + if (ret) return ret; @@ -6979,12 +9049,7 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) /* For size less than 4 we merge, else we zero extend */ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; - /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform - * the copy and tracing - */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); + complete_emulator_pio_in(vcpu, &val); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -6999,8 +9064,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, - &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { kvm_rax_write(vcpu, val); return ret; @@ -7048,35 +9112,26 @@ static void tsc_khz_changed(void *data) static void kvm_hyperv_tsc_notifier(void) { struct kvm *kvm; - struct kvm_vcpu *vcpu; int cpu; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ hyperv_stop_tsc_emulation(); /* TSC frequency always matches when on Hyper-V */ for_each_present_cpu(cpu) per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - kvm_max_guest_tsc_khz = tsc_khz; + kvm_caps.max_guest_tsc_khz = tsc_khz; list_for_each_entry(kvm, &vm_list, vm_list) { - struct kvm_arch *ka = &kvm->arch; - - spin_lock(&ka->pvclock_gtod_sync_lock); - + __kvm_start_pvclock_update(kvm); pvclock_update_vm_gtod_copy(kvm); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - - kvm_for_each_vcpu(cpu, vcpu, kvm) - kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); + kvm_end_pvclock_update(kvm); } + mutex_unlock(&kvm_lock); } #endif @@ -7085,7 +9140,8 @@ static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i, send_ipi = 0; + int send_ipi = 0; + unsigned long i; /* * We allow guests to temporarily run on slowing clocks, @@ -7186,22 +9242,22 @@ static int kvmclock_cpu_online(unsigned int cpu) static void kvm_timer_init(void) { - max_tsc_khz = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { -#ifdef CONFIG_CPU_FREQ - struct cpufreq_policy *policy; - int cpu; - - cpu = get_cpu(); - policy = cpufreq_cpu_get(cpu); - if (policy) { - if (policy->cpuinfo.max_freq) - max_tsc_khz = policy->cpuinfo.max_freq; - cpufreq_cpu_put(policy); + max_tsc_khz = tsc_khz; + + if (IS_ENABLED(CONFIG_CPU_FREQ)) { + struct cpufreq_policy *policy; + int cpu; + + cpu = get_cpu(); + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } + put_cpu(); } - put_cpu(); -#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } @@ -7210,57 +9266,12 @@ static void kvm_timer_init(void) kvmclock_cpu_online, kvmclock_cpu_down_prep); } -DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); -EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu); - -int kvm_is_in_guest(void) -{ - return __this_cpu_read(current_vcpu) != NULL; -} - -static int kvm_is_user_mode(void) -{ - int user_mode = 3; - - if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); - - return user_mode != 0; -} - -static unsigned long kvm_get_guest_ip(void) -{ - unsigned long ip = 0; - - if (__this_cpu_read(current_vcpu)) - ip = kvm_rip_read(__this_cpu_read(current_vcpu)); - - return ip; -} - -static void kvm_handle_intel_pt_intr(void) -{ - struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); - - kvm_make_request(KVM_REQ_PMI, vcpu); - __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, - (unsigned long *)&vcpu->arch.pmu.global_status); -} - -static struct perf_guest_info_callbacks kvm_guest_cbs = { - .is_in_guest = kvm_is_in_guest, - .is_user_mode = kvm_is_user_mode, - .get_guest_ip = kvm_get_guest_ip, - .handle_intel_pt_intr = kvm_handle_intel_pt_intr, -}; - #ifdef CONFIG_X86_64 static void pvclock_gtod_update_fn(struct work_struct *work) { struct kvm *kvm; - struct kvm_vcpu *vcpu; - int i; + unsigned long i; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7273,6 +9284,18 @@ static void pvclock_gtod_update_fn(struct work_struct *work) static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); /* + * Indirection to move queue_work() out of the tk_core.seq write held + * region to prevent possible deadlocks against time accessors which + * are invoked with work related locks held. + */ +static void pvclock_irq_work_fn(struct irq_work *w) +{ + queue_work(system_long_wq, &pvclock_gtod_work); +} + +static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn); + +/* * Notification about pvclock gtod data update. */ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, @@ -7283,13 +9306,14 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, update_pvclock_gtod(tk); - /* disable master clock if host does not trust, or does not - * use, TSC based clocksource. + /* + * Disable master clock if host does not trust, or does not use, + * TSC based clocksource. Delegate queue_work() to irq_work as + * this is invoked with tk_core.seq write held. */ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) - queue_work(system_long_wq, &pvclock_gtod_work); - + irq_work_queue(&pvclock_irq_work); return 0; } @@ -7300,24 +9324,24 @@ static struct notifier_block pvclock_gtod_notifier = { int kvm_arch_init(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; + u64 host_pat; int r; - struct kvm_x86_ops *ops = opaque; - if (kvm_x86_ops) { - printk(KERN_ERR "kvm: already loaded the other module\n"); - r = -EEXIST; - goto out; + if (kvm_x86_ops.hardware_enable) { + pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name); + return -EEXIST; } if (!ops->cpu_has_kvm_support()) { - pr_err_ratelimited("kvm: no hardware support\n"); - r = -EOPNOTSUPP; - goto out; + pr_err_ratelimited("kvm: no hardware support for '%s'\n", + ops->runtime_ops->name); + return -EOPNOTSUPP; } if (ops->disabled_by_bios()) { - pr_err_ratelimited("kvm: disabled by bios\n"); - r = -EOPNOTSUPP; - goto out; + pr_err_ratelimited("kvm: support for '%s' disabled by bios\n", + ops->runtime_ops->name); + return -EOPNOTSUPP; } /* @@ -7327,44 +9351,54 @@ int kvm_arch_init(void *opaque) */ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { printk(KERN_ERR "kvm: inadequate fpu\n"); - r = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - r = -ENOMEM; - x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), - __alignof__(struct fpu), SLAB_ACCOUNT, - NULL); - if (!x86_fpu_cache) { - printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); - goto out; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + return -EOPNOTSUPP; + } + + /* + * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes + * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something + * other than WB. Note, EPT doesn't utilize the PAT, but don't bother + * with an exception. PAT[0] is set to WB on RESET and also by the + * kernel, i.e. failure indicates a kernel bug or broken firmware. + */ + if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) || + (host_pat & GENMASK(2, 0)) != 6) { + pr_err("kvm: host PAT[0] is not WB\n"); + return -EIO; } - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); - goto out_free_x86_fpu_cache; + x86_emulator_cache = kvm_alloc_emulator_cache(); + if (!x86_emulator_cache) { + pr_err("kvm: failed to allocate cache for x86 emulator\n"); + return -ENOMEM; } - r = kvm_mmu_module_init(); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + r = -ENOMEM; + goto out_free_x86_emulator_cache; + } + kvm_nr_uret_msrs = 0; + + r = kvm_mmu_vendor_module_init(); if (r) goto out_free_percpu; - kvm_x86_ops = ops; - - kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); - perf_register_guest_info_callbacks(&kvm_guest_cbs); - - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + } - kvm_lapic_init(); if (pi_inject_timer == -1) - pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); + pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); @@ -7375,10 +9409,9 @@ int kvm_arch_init(void *opaque) return 0; out_free_percpu: - free_percpu(shared_msrs); -out_free_x86_fpu_cache: - kmem_cache_destroy(x86_fpu_cache); -out: + free_percpu(user_return_msrs); +out_free_x86_emulator_cache: + kmem_cache_destroy(x86_emulator_cache); return r; } @@ -7389,7 +9422,6 @@ void kvm_arch_exit(void) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); - perf_unregister_guest_info_callbacks(&kvm_guest_cbs); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, @@ -7397,25 +9429,43 @@ void kvm_arch_exit(void) cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); + irq_work_sync(&pvclock_irq_work); + cancel_work_sync(&pvclock_gtod_work); +#endif + kvm_x86_ops.hardware_enable = NULL; + kvm_mmu_vendor_module_exit(); + free_percpu(user_return_msrs); + kmem_cache_destroy(x86_emulator_cache); +#ifdef CONFIG_KVM_XEN + static_key_deferred_flush(&kvm_xen_enabled); + WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); #endif - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); - free_percpu(shared_msrs); - kmem_cache_destroy(x86_fpu_cache); } -int kvm_vcpu_halt(struct kvm_vcpu *vcpu) +static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) { + /* + * The vCPU has halted, e.g. executed HLT. Update the run state if the + * local APIC is in-kernel, the run loop will detect the non-runnable + * state and halt the vCPU. Exit to userspace if the local APIC is + * managed by userspace, in which case userspace is responsible for + * handling wake events. + */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + vcpu->arch.mp_state = state; return 1; } else { - vcpu->run->exit_reason = KVM_EXIT_HLT; + vcpu->run->exit_reason = reason; return 0; } } -EXPORT_SYMBOL_GPL(kvm_vcpu_halt); + +int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) +{ + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { @@ -7424,10 +9474,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered * KVM_EXIT_DEBUG here. */ - return kvm_vcpu_halt(vcpu) && ret; + return kvm_emulate_halt_noskip(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) +{ + int ret = kvm_skip_emulated_instruction(vcpu); + + return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, + KVM_EXIT_AP_RESET_HOLD) && ret; +} +EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); + #ifdef CONFIG_X86_64 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, unsigned long clock_type) @@ -7440,7 +9499,14 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) return -KVM_EOPNOTSUPP; - if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + /* + * When tsc is in permanent catchup mode guests won't be able to use + * pvclock_read_retry loop to get consistent view of pvclock + */ + if (vcpu->arch.tsc_always_catchup) + return -KVM_EOPNOTSUPP; + + if (!kvm_get_walltime_and_clockread(&ts, &cycle)) return -KVM_EOPNOTSUPP; clock_pairing.sec = ts.tv_sec; @@ -7463,17 +9529,19 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, * * @apicid - apicid of vcpu to be kicked. */ -static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) +static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) { - struct kvm_lapic_irq lapic_irq; - - lapic_irq.shorthand = APIC_DEST_NOSHORT; - lapic_irq.dest_mode = APIC_DEST_PHYSICAL; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } @@ -7483,32 +9551,82 @@ bool kvm_apicv_activated(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_apicv_activated); -void kvm_apicv_init(struct kvm *kvm, bool enable) +bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) +{ + ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); + ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); + + return (vm_reasons | vcpu_reasons) == 0; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); + +static void set_or_clear_apicv_inhibit(unsigned long *inhibits, + enum kvm_apicv_inhibit reason, bool set) { - if (enable) - clear_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + if (set) + __set_bit(reason, inhibits); else - set_bit(APICV_INHIBIT_REASON_DISABLE, - &kvm->arch.apicv_inhibit_reasons); + __clear_bit(reason, inhibits); + + trace_kvm_apicv_inhibit_changed(reason, set, *inhibits); } -EXPORT_SYMBOL_GPL(kvm_apicv_init); -static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +static void kvm_apicv_init(struct kvm *kvm) +{ + unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; + + init_rwsem(&kvm->arch.apicv_update_lock); + + set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true); + + if (!enable_apicv) + set_or_clear_apicv_inhibit(inhibits, + APICV_INHIBIT_REASON_DISABLE, true); +} + +static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) { struct kvm_vcpu *target = NULL; struct kvm_apic_map *map; + vcpu->stat.directed_yield_attempted++; + + if (single_task_running()) + goto no_yield; + rcu_read_lock(); - map = rcu_dereference(kvm->arch.apic_map); + map = rcu_dereference(vcpu->kvm->arch.apic_map); if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) target = map->phys_map[dest_id]->vcpu; rcu_read_unlock(); - if (target && READ_ONCE(target->ready)) - kvm_vcpu_yield_to(target); + if (!target || !READ_ONCE(target->ready)) + goto no_yield; + + /* Ignore requests to yield to self */ + if (vcpu == target) + goto no_yield; + + if (kvm_vcpu_yield_to(target) <= 0) + goto no_yield; + + vcpu->stat.directed_yield_successful++; + +no_yield: + return; +} + +static int complete_hypercall_exit(struct kvm_vcpu *vcpu) +{ + u64 ret = vcpu->run->hypercall.ret; + + if (!is_64_bit_mode(vcpu)) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -7516,7 +9634,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int op_64_bit; - if (kvm_hv_hypercall_enabled(vcpu->kvm)) + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + if (kvm_hv_hypercall_enabled(vcpu)) return kvm_hv_hypercall(vcpu); nr = kvm_rax_read(vcpu); @@ -7527,7 +9648,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - op_64_bit = is_64_bit_mode(vcpu); + op_64_bit = is_64_bit_hypercall(vcpu); if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -7536,18 +9657,23 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops->get_cpl(vcpu) != 0) { + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { ret = -KVM_EPERM; goto out; } + ret = -KVM_ENOSYS; + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: - kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); - kvm_sched_yield(vcpu->kvm, a1); + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + + kvm_pv_kick_cpu_op(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -7556,12 +9682,40 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); break; case KVM_HC_SCHED_YIELD: - kvm_sched_yield(vcpu->kvm, a0); + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + + kvm_sched_yield(vcpu, a0); ret = 0; break; + case KVM_HC_MAP_GPA_RANGE: { + u64 gpa = a0, npages = a1, attrs = a2; + + ret = -KVM_ENOSYS; + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + if (!PAGE_ALIGNED(gpa) || !npages || + gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) { + ret = -KVM_EINVAL; + break; + } + + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = gpa; + vcpu->run->hypercall.args[1] = npages; + vcpu->run->hypercall.args[2] = attrs; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + } default: ret = -KVM_ENOSYS; break; @@ -7582,7 +9736,18 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); + /* + * If the quirk is disabled, synthesize a #UD and let the guest pick up + * the pieces. + */ + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) { + ctxt->exception.error_code_valid = false; + ctxt->exception.vector = UD_VECTOR; + ctxt->have_exception = true; + return X86EMUL_PROPAGATE_FAULT; + } + + static_call(kvm_x86_patch_hypercall)(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -7594,30 +9759,34 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) likely(!pic_in_kernel(vcpu->kvm)); } +/* Called within kvm->srcu read side. */ static void post_kvm_run_save(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; - kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; + kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); + kvm_run->ready_for_interrupt_injection = pic_in_kernel(vcpu->kvm) || kvm_vcpu_ready_for_interrupt_injection(vcpu); + + if (is_smm(vcpu)) + kvm_run->flags |= KVM_RUN_X86_SMM; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; - if (!kvm_x86_ops->update_cr8_intercept) + if (!kvm_x86_ops.update_cr8_intercept) return; if (!lapic_in_kernel(vcpu)) return; - if (vcpu->arch.apicv_active) + if (vcpu->arch.apic->apicv_active) return; if (!vcpu->arch.apic->vapic_addr) @@ -7630,119 +9799,268 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); +} + + +int kvm_check_nested_events(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + return 1; + } + + return kvm_x86_ops.nested_ops->check_events(vcpu); } -static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) +static void kvm_inject_exception(struct kvm_vcpu *vcpu) { + trace_kvm_inj_exception(vcpu->arch.exception.vector, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) + vcpu->arch.exception.error_code = false; + static_call(kvm_x86_inject_exception)(vcpu); +} + +/* + * Check for any event (interrupt or exception) that is ready to be injected, + * and if there is at least one event, inject the event with the highest + * priority. This handles both "pending" events, i.e. events that have never + * been injected into the guest, and "injected" events, i.e. events that were + * injected as part of a previous VM-Enter, but weren't successfully delivered + * and need to be re-injected. + * + * Note, this is not guaranteed to be invoked on a guest instruction boundary, + * i.e. doesn't guarantee that there's an event window in the guest. KVM must + * be able to inject exceptions in the "middle" of an instruction, and so must + * also be able to re-inject NMIs and IRQs in the middle of an instruction. + * I.e. for exceptions and re-injected events, NOT invoking this on instruction + * boundaries is necessary and correct. + * + * For simplicity, KVM uses a single path to inject all events (except events + * that are injected directly from L1 to L2) and doesn't explicitly track + * instruction boundaries for asynchronous events. However, because VM-Exits + * that can occur during instruction execution typically result in KVM skipping + * the instruction or injecting an exception, e.g. instruction and exception + * intercepts, and because pending exceptions have higher priority than pending + * interrupts, KVM still honors instruction boundaries in most scenarios. + * + * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip + * the instruction or inject an exception, then KVM can incorrecty inject a new + * asynchrounous event if the event became pending after the CPU fetched the + * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation) + * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be + * injected on the restarted instruction instead of being deferred until the + * instruction completes. + * + * In practice, this virtualization hole is unlikely to be observed by the + * guest, and even less likely to cause functional problems. To detect the + * hole, the guest would have to trigger an event on a side effect of an early + * phase of instruction execution, e.g. on the instruction fetch from memory. + * And for it to be a functional problem, the guest would need to depend on the + * ordering between that side effect, the instruction completing, _and_ the + * delivery of the asynchronous event. + */ +static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, + bool *req_immediate_exit) +{ + bool can_inject; int r; - /* try to reinject previous events if any */ + /* + * Process nested events first, as nested VM-Exit supercedes event + * re-injection. If there's an event queued for re-injection, it will + * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. + */ + if (is_guest_mode(vcpu)) + r = kvm_check_nested_events(vcpu); + else + r = 0; + /* + * Re-inject exceptions and events *especially* if immediate entry+exit + * to/from L2 is needed, as any event that has already been injected + * into L2 needs to complete its lifecycle before injecting a new event. + * + * Don't re-inject an NMI or interrupt if there is a pending exception. + * This collision arises if an exception occurred while vectoring the + * injected event, KVM intercepted said exception, and KVM ultimately + * determined the fault belongs to the guest and queues the exception + * for injection back into the guest. + * + * "Injected" interrupts can also collide with pending exceptions if + * userspace ignores the "ready for injection" flag and blindly queues + * an interrupt. In that case, prioritizing the exception is correct, + * as the exception "occurred" before the exit to userspace. Trap-like + * exceptions, e.g. most #DBs, have higher priority than interrupts. + * And while fault-like exceptions, e.g. #GP and #PF, are the lowest + * priority, they're only generated (pended) during instruction + * execution, and interrupts are recognized at instruction boundaries. + * Thus a pending fault-like exception means the fault occurred on the + * *previous* instruction and must be serviced prior to recognizing any + * new events in order to fully complete the previous instruction. + */ if (vcpu->arch.exception.injected) - kvm_x86_ops->queue_exception(vcpu); + kvm_inject_exception(vcpu); + else if (kvm_is_exception_pending(vcpu)) + ; /* see above */ + else if (vcpu->arch.nmi_injected) + static_call(kvm_x86_inject_nmi)(vcpu); + else if (vcpu->arch.interrupt.injected) + static_call(kvm_x86_inject_irq)(vcpu, true); + /* - * Do not inject an NMI or interrupt if there is a pending - * exception. Exceptions and interrupts are recognized at - * instruction boundaries, i.e. the start of an instruction. - * Trap-like exceptions, e.g. #DB, have higher priority than - * NMIs and interrupts, i.e. traps are recognized before an - * NMI/interrupt that's pending on the same instruction. - * Fault-like exceptions, e.g. #GP and #PF, are the lowest - * priority, but are only generated (pended) during instruction - * execution, i.e. a pending fault-like exception means the - * fault occurred on the *previous* instruction and must be - * serviced prior to recognizing any new events in order to - * fully complete the previous instruction. + * Exceptions that morph to VM-Exits are handled above, and pending + * exceptions on top of injected exceptions that do not VM-Exit should + * either morph to #DF or, sadly, override the injected exception. */ - else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) - kvm_x86_ops->set_nmi(vcpu); - else if (vcpu->arch.interrupt.injected) - kvm_x86_ops->set_irq(vcpu); - } + WARN_ON_ONCE(vcpu->arch.exception.injected && + vcpu->arch.exception.pending); /* - * Call check_nested_events() even if we reinjected a previous event - * in order for caller to determine if it should require immediate-exit - * from L2 to L1 due to pending L1 events which require exit - * from L2 to L1. + * Bail if immediate entry+exit to/from the guest is needed to complete + * nested VM-Enter or event re-injection so that a different pending + * event can be serviced (or if KVM needs to exit to userspace). + * + * Otherwise, continue processing events even if VM-Exit occurred. The + * VM-Exit will have cleared exceptions that were meant for L2, but + * there may now be events that can be injected into L1. */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); - if (r != 0) - return r; - } + if (r < 0) + goto out; - /* try to inject new event if pending */ - if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + /* + * A pending exception VM-Exit should either result in nested VM-Exit + * or force an immediate re-entry and exit to/from L2, and exception + * VM-Exits cannot be injected (flag should _never_ be set). + */ + WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || + vcpu->arch.exception_vmexit.pending); - WARN_ON_ONCE(vcpu->arch.exception.injected); - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; + /* + * New events, other than exceptions, cannot be injected if KVM needs + * to re-inject a previous event. See above comments on re-injecting + * for why pending exceptions get priority. + */ + can_inject = !kvm_event_needs_reinjection(vcpu); - if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + if (vcpu->arch.exception.pending) { + /* + * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS + * value pushed on the stack. Trap-like exception and all #DBs + * leave RF as-is (KVM follows Intel's behavior in this regard; + * AMD states that code breakpoint #DBs excplitly clear RF=0). + * + * Note, most versions of Intel's SDM and AMD's APM incorrectly + * describe the behavior of General Detect #DBs, which are + * fault-like. They do _not_ set RF, a la code breakpoints. + */ + if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR) { - /* - * This code assumes that nSVM doesn't use - * check_nested_events(). If it does, the - * DR6/DR7 changes should happen before L1 - * gets a #VMEXIT for an intercepted #DB in - * L2. (Under VMX, on the other hand, the - * DR6/DR7 changes should not happen in the - * event of a VM-exit to L1 for an intercepted - * #DB in L2.) - */ - kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.exception.vector == DB_VECTOR) { + kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; kvm_update_dr7(vcpu); } } - kvm_x86_ops->queue_exception(vcpu); + kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + + can_inject = false; } - /* Don't consider new event if we re-injected an event */ - if (kvm_event_needs_reinjection(vcpu)) + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) return 0; - if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops->smi_allowed(vcpu)) { - vcpu->arch.smi_pending = false; - ++vcpu->arch.smi_count; - enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu)) { - /* - * Because interrupts can be injected asynchronously, we are - * calling check_nested_events again here to avoid a race condition. - * See https://lkml.org/lkml/2014/7/2/60 for discussion about this - * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? - */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu, req_int_win); - if (r != 0) - return r; + /* + * Finally, inject interrupt events. If an event cannot be injected + * due to architectural conditions (e.g. IF=0) a window-open exit + * will re-request KVM_REQ_EVENT. Sometimes however an event is pending + * and can architecturally be injected, but we cannot do it right now: + * an interrupt could have arrived just now and we have to inject it + * as a vmexit, or there could already an event in the queue, which is + * indicated by can_inject. In that case we request an immediate exit + * in order to make progress and get back here for another iteration. + * The kvm_x86_ops hooks communicate this by returning -EBUSY. + */ + if (vcpu->arch.smi_pending) { + r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; + if (r < 0) + goto out; + if (r) { + vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; + enter_smm(vcpu); + can_inject = false; + } else + static_call(kvm_x86_enable_smi_window)(vcpu); + } + + if (vcpu->arch.nmi_pending) { + r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; + if (r < 0) + goto out; + if (r) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + static_call(kvm_x86_inject_nmi)(vcpu); + can_inject = false; + WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); } - if (kvm_x86_ops->interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); - kvm_x86_ops->set_irq(vcpu); + if (vcpu->arch.nmi_pending) + static_call(kvm_x86_enable_nmi_window)(vcpu); + } + + if (kvm_cpu_has_injectable_intr(vcpu)) { + r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; + if (r < 0) + goto out; + if (r) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); + static_call(kvm_x86_inject_irq)(vcpu, false); + WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); } + if (kvm_cpu_has_injectable_intr(vcpu)) + static_call(kvm_x86_enable_irq_window)(vcpu); } + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) + *req_immediate_exit = true; + + /* + * KVM must never queue a new exception while injecting an event; KVM + * is done emulating and should only propagate the to-be-injected event + * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an + * infinite loop as KVM will bail from VM-Enter to inject the pending + * exception and start the cycle all over. + * + * Exempt triple faults as they have special handling and won't put the + * vCPU into an infinite loop. Triple fault can be queued when running + * VMX without unrestricted guest, as that requires KVM to emulate Real + * Mode events (see kvm_inject_realmode_interrupt()). + */ + WARN_ON_ONCE(vcpu->arch.exception.pending || + vcpu->arch.exception_vmexit.pending); return 0; + +out: + if (r == -EBUSY) { + *req_immediate_exit = true; + r = 0; + } + return r; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -7754,7 +10072,7 @@ static void process_nmi(struct kvm_vcpu *vcpu) * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -7825,7 +10143,7 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); for (i = 0; i < 8; i++) - put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read_raw(vcpu, i)); kvm_get_dr(vcpu, 6, &val); put_smstate(u32, buf, 0x7fcc, (u32)val); @@ -7844,11 +10162,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops->get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops->get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -7871,7 +10189,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) int i; for (i = 0; i < 16; i++) - put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read_raw(vcpu, i)); put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); @@ -7898,7 +10216,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops->get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -7908,7 +10226,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops->get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -7921,10 +10239,9 @@ static void enter_smm(struct kvm_vcpu *vcpu) { struct kvm_segment cs, ds; struct desc_ptr dt; + unsigned long cr0; char buf[512]; - u32 cr0; - trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) @@ -7934,34 +10251,34 @@ static void enter_smm(struct kvm_vcpu *vcpu) enter_smm_save_state_32(vcpu, buf); /* - * Give pre_enter_smm() a chance to make ISA-specific changes to the - * vCPU state (e.g. leave guest mode) after we've saved the state into - * the SMM state-save area. + * Give enter_smm() a chance to make ISA-specific changes to the vCPU + * state (e.g. leave guest mode) after we've saved the state into the + * SMM state-save area. */ - kvm_x86_ops->pre_enter_smm(vcpu, buf); + static_call(kvm_x86_enter_smm)(vcpu, buf); - vcpu->arch.hflags |= HF_SMM_MASK; + kvm_smm_changed(vcpu, true); kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops->get_nmi_mask(vcpu)) + if (static_call(kvm_x86_get_nmi_mask)(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops->set_nmi_mask(vcpu, true); + static_call(kvm_x86_set_nmi_mask)(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops->set_cr0(vcpu, cr0); + static_call(kvm_x86_set_cr0)(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops->set_cr4(vcpu, 0); + static_call(kvm_x86_set_cr4)(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. */ dt.address = dt.size = 0; - kvm_x86_ops->set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); - __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + kvm_set_dr(vcpu, 7, DR7_FIXED_1); cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; cs.base = vcpu->arch.smbase; @@ -7990,10 +10307,10 @@ static void enter_smm(struct kvm_vcpu *vcpu) #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - kvm_x86_ops->set_efer(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); #endif - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } @@ -8006,14 +10323,7 @@ static void process_smi(struct kvm_vcpu *vcpu) void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap) { - cpumask_var_t cpus; - - zalloc_cpumask_var(&cpus, GFP_ATOMIC); - - kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, - vcpu_bitmap, cpus); - - free_cpumask_var(cpus); + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); } void kvm_make_scan_ioapic_request(struct kvm *kvm) @@ -8023,44 +10333,93 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + struct kvm_lapic *apic = vcpu->arch.apic; + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + down_read(&vcpu->kvm->arch.apicv_update_lock); + preempt_disable(); + + /* Do not activate APICV when APIC is disabled */ + activate = kvm_vcpu_apicv_activated(vcpu) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); + + if (apic->apicv_active == activate) + goto out; + + apic->apicv_active = activate; kvm_apic_update_apicv(vcpu); - kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); + static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); + + /* + * When APICv gets disabled, we may still have injected interrupts + * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was + * still active when the interrupt got accepted. Make sure + * kvm_check_and_inject_events() is called to check for that. + */ + if (!apic->apicv_active) + kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + preempt_enable(); + up_read(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) { - if (!kvm_x86_ops->check_apicv_inhibit_reasons || - !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) + unsigned long old, new; + + lockdep_assert_held_write(&kvm->arch.apicv_update_lock); + + if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason)) return; - if (activate) { - if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - !kvm_apicv_activated(kvm)) - return; + old = new = kvm->arch.apicv_inhibit_reasons; + + set_or_clear_apicv_inhibit(&new, reason, set); + + if (!!old != !!new) { + /* + * Kick all vCPUs before setting apicv_inhibit_reasons to avoid + * false positives in the sanity check WARN in svm_vcpu_run(). + * This task will wait for all vCPUs to ack the kick IRQ before + * updating apicv_inhibit_reasons, and all other vCPUs will + * block on acquiring apicv_update_lock so that vCPUs can't + * redo svm_vcpu_run() without seeing the new inhibit state. + * + * Note, holding apicv_update_lock and taking it in the read + * side (handling the request) also prevents other vCPUs from + * servicing the request with a stale apicv_inhibit_reasons. + */ + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + int idx = srcu_read_lock(&kvm->srcu); + + kvm_zap_gfn_range(kvm, gfn, gfn+1); + srcu_read_unlock(&kvm->srcu, idx); + } } else { - if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || - kvm_apicv_activated(kvm)) - return; + kvm->arch.apicv_inhibit_reasons = new; } +} + +void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, + enum kvm_apicv_inhibit reason, bool set) +{ + if (!enable_apicv) + return; - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops->pre_update_apicv_exec_ctrl) - kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate); - kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + down_write(&kvm->arch.apicv_update_lock); + __kvm_set_or_clear_apicv_inhibit(kvm, reason, set); + up_write(&kvm->arch.apicv_update_lock); } -EXPORT_SYMBOL_GPL(kvm_request_apicv_update); +EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -8072,8 +10431,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -8091,14 +10449,20 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, - vcpu_to_synic(vcpu)->vec_bitmap, 256); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + if (to_hv_vcpu(vcpu)) { + bitmap_or((ulong *)eoi_exit_bitmap, + vcpu->arch.ioapic_handled_vectors, + to_hv_synic(vcpu)->vec_bitmap, 256); + static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); + return; + } + + static_call_cond(kvm_x86_load_eoi_exitmap)( + vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end, - bool blockable) +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) { unsigned long apic_address; @@ -8109,30 +10473,19 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); - - return 0; } -void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { - struct page *page = NULL; + static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); +} +static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) +{ if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops->set_apic_access_page_addr) - return; - - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -8142,6 +10495,7 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); /* + * Called within kvm->srcu read side. * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the * userspace. @@ -8152,23 +10506,36 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; + fastpath_t exit_fastpath; bool req_immediate_exit = false; + /* Forbid vmenter if vcpu dirty ring is soft-full */ + if (unlikely(vcpu->kvm->dirty_ring_size && + kvm_dirty_ring_soft_full(&vcpu->dirty_ring))) { + vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; + trace_kvm_dirty_ring_exit(vcpu); + r = 0; + goto out; + } + if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { + r = -EIO; + goto out; + } + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; goto out; } } - if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) - kvm_mmu_unload(vcpu); + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) - kvm_gen_update_masterclock(vcpu->kvm); + kvm_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu)) kvm_gen_kvmclock_update(vcpu); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { @@ -8178,20 +10545,30 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) - kvm_mmu_load_cr3(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) + kvm_mmu_load_pgd(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + kvm_service_local_tlb_flush_requests(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; - vcpu->mmio_needed = 0; - r = 0; - goto out; + if (is_guest_mode(vcpu)) { + kvm_x86_ops.nested_ops->triple_fault(vcpu); + } else { + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->mmio_needed = 0; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ @@ -8229,18 +10606,22 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + vcpu->run->system_event.ndata = 0; r = 0; goto out; } if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { + struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + vcpu->run->exit_reason = KVM_EXIT_HYPERV; - vcpu->run->hyperv = vcpu->arch.hyperv.exit; + vcpu->run->hyperv = hv_vcpu->exit; r = 0; goto out; } @@ -8254,42 +10635,35 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); + if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) + kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + static_call(kvm_x86_msr_filter_changed)(vcpu); + + if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) + static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || + kvm_xen_has_interrupt(vcpu)) { ++vcpu->stat.req_event; - kvm_apic_accept_events(vcpu); + r = kvm_apic_accept_events(vcpu); + if (r < 0) { + r = 0; + goto out; + } if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; goto out; } - if (inject_pending_event(vcpu, req_int_win) != 0) - req_immediate_exit = true; - else { - /* Enable SMI/NMI/IRQ window open exits if needed. - * - * SMIs have three cases: - * 1) They can be nested, and then there is nothing to - * do here because RSM will cause a vmexit anyway. - * 2) There is an ISA-specific reason why SMI cannot be - * injected, and the moment when this changes can be - * intercepted. - * 3) Or the SMI can be pending because - * inject_pending_event has completed the injection - * of an IRQ or NMI from the previous vmexit, and - * then we request an immediate exit to inject the - * SMI. - */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops->enable_smi_window(vcpu)) - req_immediate_exit = true; - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - WARN_ON(vcpu->arch.exception.pending); + r = kvm_check_and_inject_events(vcpu, &req_immediate_exit); + if (r < 0) { + r = 0; + goto out; } + if (req_int_win) + static_call(kvm_x86_enable_irq_window)(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -8304,7 +10678,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); - kvm_x86_ops->prepare_guest_switch(vcpu); + static_call(kvm_x86_prepare_switch_to_guest)(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -8312,9 +10686,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * result in virtual interrupt delivery. */ local_irq_disable(); - vcpu->mode = IN_GUEST_MODE; - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + /* Store vcpu->apicv_active before vcpu->mode. */ + smp_store_release(&vcpu->mode, IN_GUEST_MODE); + + kvm_vcpu_srcu_read_unlock(vcpu); /* * 1) We should set ->mode before checking ->requests. Please see @@ -8331,46 +10707,71 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_mb__after_srcu_read_unlock(); /* - * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. + * Process pending posted interrupts to handle the case where the + * notification IRQ arrived in the host, or was never sent (because the + * target vCPU wasn't running). Do this regardless of the vCPU's APICv + * status, KVM doesn't update assigned devices when APICv is inhibited, + * i.e. they can post interrupts even if APICv is temporarily disabled. */ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); - if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) - || need_resched() || signal_pending(current)) { + if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); r = 1; goto cancel_injection; } if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops->request_immediate_exit(vcpu); + static_call(kvm_x86_request_immediate_exit)(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); - guest_enter_irqoff(); - fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) switch_fpu_return(); + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; + } else if (unlikely(hw_breakpoint_active())) { + set_debugreg(0, 7); } - kvm_x86_ops->run(vcpu); + guest_timing_enter_irqoff(); + + for (;;) { + /* + * Assert that vCPU vs. VM APICv state is consistent. An APICv + * update must kick and wait for all vCPUs before toggling the + * per-VM state, and responsing vCPUs must wait for the update + * to complete before servicing KVM_REQ_APICV_UPDATE. + */ + WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && + (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); + + exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu); + if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) + break; + + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); + + if (unlikely(kvm_vcpu_exit_request(vcpu))) { + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + break; + } + } /* * Do this here before restoring debug registers on the host. And @@ -8380,11 +10781,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_ops->sync_dirty_debug_regs(vcpu); + static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); - kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -8397,12 +10796,24 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath); + /* + * Sync xfd before calling handle_exit_irqoff() which may + * rely on the fact that guest_fpu::xfd is up-to-date (e.g. + * in #NM irqoff handler). + */ + if (vcpu->arch.xfd_no_write_intercept) + fpu_sync_guest_vmexit_xfd_state(); + + static_call(kvm_x86_handle_exit_irqoff)(vcpu); + + if (vcpu->arch.guest_fpu.xfd_err) + wrmsrl(MSR_IA32_XFD_ERR, 0); /* * Consume any pending interrupts, including the possible source of @@ -8411,25 +10822,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * interrupts on processors that implement an interrupt shadow, the * stat.exits increment will do nicely. */ - kvm_before_interrupt(vcpu); + kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ); local_irq_enable(); ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); - guest_exit_irqoff(); - if (lapic_in_kernel(vcpu)) { - s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; - if (delta != S64_MIN) { - trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); - vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; - } - } + /* + * Wait until after servicing IRQs to account guest time so that any + * ticks that occurred while running the guest are properly accounted + * to the guest. Waiting until IRQs are enabled degrades the accuracy + * of accounting via context tracking, but the loss of accuracy is + * acceptable for all known use cases. + */ + guest_timing_exit_irqoff(); local_irq_enable(); preempt_enable(); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_vcpu_srcu_read_lock(vcpu); /* * Profile KVM exit RIPs: @@ -8445,47 +10856,82 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath); + r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); return r; cancel_injection: - kvm_x86_ops->cancel_injection(vcpu); + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); + static_call(kvm_x86_cancel_injection)(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: return r; } -static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) +/* Called within kvm->srcu read side. */ +static inline int vcpu_block(struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - kvm_vcpu_block(vcpu); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + bool hv_timer; - if (kvm_x86_ops->post_block) - kvm_x86_ops->post_block(vcpu); + if (!kvm_arch_vcpu_runnable(vcpu)) { + /* + * Switch to the software timer before halt-polling/blocking as + * the guest's timer may be a break event for the vCPU, and the + * hypervisor timer runs only when the CPU is in guest mode. + * Switch before halt-polling so that KVM recognizes an expired + * timer before blocking. + */ + hv_timer = kvm_lapic_hv_timer_in_use(vcpu); + if (hv_timer) + kvm_lapic_switch_to_sw_timer(vcpu); - if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) + kvm_vcpu_srcu_read_unlock(vcpu); + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + kvm_vcpu_halt(vcpu); + else + kvm_vcpu_block(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + + if (hv_timer) + kvm_lapic_switch_to_hv_timer(vcpu); + + /* + * If the vCPU is not runnable, a signal or another host event + * of some kind is pending; service it without changing the + * vCPU's activity state. + */ + if (!kvm_arch_vcpu_runnable(vcpu)) return 1; } - kvm_apic_accept_events(vcpu); + /* + * Evaluate nested events before exiting the halted state. This allows + * the halt state to be recorded properly in the VMCS12's activity + * state field (AMD does not have a similar field and a VM-Exit always + * causes a spurious wakeup from HLT). + */ + if (is_guest_mode(vcpu)) { + if (kvm_check_nested_events(vcpu) < 0) + return 0; + } + + if (kvm_apic_accept_events(vcpu) < 0) + return 0; switch(vcpu->arch.mp_state) { case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - /* fall through */ + fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break; case KVM_MP_STATE_INIT_RECEIVED: break; default: - return -EINTR; + WARN_ON_ONCE(1); break; } return 1; @@ -8493,32 +10939,38 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); - return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } +/* Called within kvm->srcu read side. */ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; - struct kvm *kvm = vcpu->kvm; - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); vcpu->arch.l1tf_flush_l1d = true; for (;;) { + /* + * If another guest vCPU requests a PV TLB flush in the middle + * of instruction emulation, the rest of the emulation could + * use a stale page translation. Assume that any code after + * this point can start executing an instruction. + */ + vcpu->arch.at_instruction_boundary = false; if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { - r = vcpu_block(kvm, vcpu); + r = vcpu_block(vcpu); } if (r <= 0) break; - kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); + if (kvm_xen_has_pending_events(vcpu)) + kvm_xen_inject_pending_events(vcpu); + if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -8530,34 +10982,21 @@ static int vcpu_run(struct kvm_vcpu *vcpu) break; } - kvm_check_async_pf_completion(vcpu); - - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - break; - } - if (need_resched()) { - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); - vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + if (__xfer_to_guest_mode_work_pending()) { + kvm_vcpu_srcu_read_unlock(vcpu); + r = xfer_to_guest_mode_handle_work(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + if (r) + return r; } } - srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - return r; } static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { - int r; - - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - return r; + return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8630,83 +11069,69 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } -static void kvm_save_current_fpu(struct fpu *fpu) -{ - /* - * If the target FPU state is not resident in the CPU registers, just - * memcpy() from current, else save CPU state directly to the target. - */ - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - memcpy(&fpu->state, ¤t->thread.fpu.state, - fpu_kernel_xstate_size); - else - copy_fpregs_to_fpstate(fpu); -} - /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.user_fpu); - - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, - ~XFEATURE_MASK_PKRU); - - fpregs_mark_activate(); - fpregs_unlock(); - + /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true); trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - fpregs_lock(); - - kvm_save_current_fpu(vcpu->arch.guest_fpu); - - copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); - - fpregs_mark_activate(); - fpregs_unlock(); - + fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false); ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_queued_exception *ex = &vcpu->arch.exception; + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); kvm_sigset_activate(vcpu); + kvm_run->flags = 0; kvm_load_guest_fpu(vcpu); + kvm_vcpu_srcu_read_lock(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { if (kvm_run->immediate_exit) { r = -EINTR; goto out; } + /* + * It should be impossible for the hypervisor timer to be in + * use before KVM has ever run the vCPU. + */ + WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu)); + + kvm_vcpu_srcu_read_unlock(vcpu); kvm_vcpu_block(vcpu); - kvm_apic_accept_events(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); + kvm_vcpu_srcu_read_lock(vcpu); + + if (kvm_apic_accept_events(vcpu) < 0) { + r = 0; + goto out; + } r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8720,27 +11145,51 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } + /* + * If userspace set a pending exception and L2 is active, convert it to + * a pending VM-Exit if L1 wants to intercept the exception. + */ + if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, + ex->error_code)) { + kvm_queue_exception_vmexit(vcpu, ex->vector, + ex->has_error_code, ex->error_code, + ex->has_payload, ex->payload); + ex->injected = false; + ex->pending = false; + } + vcpu->arch.exception_from_userspace = false; + if (unlikely(vcpu->arch.complete_userspace_io)) { int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; vcpu->arch.complete_userspace_io = NULL; r = cui(vcpu); if (r <= 0) goto out; - } else - WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); + } else { + WARN_ON_ONCE(vcpu->arch.pio.count); + WARN_ON_ONCE(vcpu->mmio_needed); + } - if (kvm_run->immediate_exit) + if (kvm_run->immediate_exit) { r = -EINTR; - else - r = vcpu_run(vcpu); + goto out; + } + + r = static_call(kvm_x86_vcpu_pre_run)(vcpu); + if (r <= 0) + goto out; + + r = vcpu_run(vcpu); out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); - kvm_sigset_deactivate(vcpu); + kvm_vcpu_srcu_read_unlock(vcpu); + kvm_sigset_deactivate(vcpu); vcpu_put(vcpu); return r; } @@ -8755,7 +11204,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } regs->rax = kvm_rax_read(vcpu); @@ -8817,6 +11266,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; + vcpu->arch.exception_vmexit.pending = false; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -8829,20 +11279,13 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) -{ - struct kvm_segment cs; - - kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); - *db = cs.db; - *l = cs.l; -} -EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); - -static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct desc_ptr dt; + if (vcpu->arch.guest_state_protected) + goto skip_protected_regs; + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -8853,28 +11296,52 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_x86_ops->get_idt(vcpu, &dt); + static_call(kvm_x86_get_idt)(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - kvm_x86_ops->get_gdt(vcpu, &dt); + static_call(kvm_x86_get_gdt)(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; - sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = kvm_read_cr3(vcpu); + +skip_protected_regs: + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); +} - memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); +static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + __get_sregs_common(vcpu, sregs); + + if (vcpu->arch.guest_state_protected) + return; if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); } +static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int i; + + __get_sregs_common(vcpu, (struct kvm_sregs *)sregs2); + + if (vcpu->arch.guest_state_protected) + return; + + if (is_pae_paging(vcpu)) { + for (i = 0 ; i < 4 ; i++) + sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i); + sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; + } +} + int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -8887,21 +11354,29 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int r; + vcpu_load(vcpu); if (kvm_mpx_supported()) kvm_load_guest_fpu(vcpu); - kvm_apic_accept_events(vcpu); - if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && - vcpu->arch.pv.pv_unhalted) + r = kvm_apic_accept_events(vcpu); + if (r < 0) + goto out; + r = 0; + + if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || + vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && + vcpu->arch.pv.pv_unhalted) mp_state->mp_state = KVM_MP_STATE_RUNNABLE; else mp_state->mp_state = vcpu->arch.mp_state; +out: if (kvm_mpx_supported()) kvm_put_guest_fpu(vcpu); vcpu_put(vcpu); - return 0; + return r; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, @@ -8911,16 +11386,30 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + switch (mp_state->mp_state) { + case KVM_MP_STATE_UNINITIALIZED: + case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: + case KVM_MP_STATE_INIT_RECEIVED: + case KVM_MP_STATE_SIPI_RECEIVED: + if (!lapic_in_kernel(vcpu)) + goto out; + break; + + case KVM_MP_STATE_RUNNABLE: + break; + + default: goto out; + } /* - * KVM_MP_STATE_INIT_RECEIVED means the processor is in - * INIT state; latched init should be reported using - * KVM_SET_VCPU_EVENTS, so reject it here. + * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow + * forcing the guest into INIT/SIPI if those events are supposed to be + * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state + * if an SMI is pending as well. */ - if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && + if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -8941,7 +11430,7 @@ out: int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -8961,7 +11450,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, } EXPORT_SYMBOL_GPL(kvm_task_switch); -static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* @@ -8969,82 +11458,72 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) * 64-bit mode (though maybe in a 32-bit code segment). * CR4.PAE and EFER.LMA must be set. */ - if (!(sregs->cr4 & X86_CR4_PAE) - || !(sregs->efer & EFER_LMA)) - return -EINVAL; + if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return false; + if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3)) + return false; } else { /* * Not in 64-bit mode: EFER.LMA is clear and the code * segment cannot be 64-bit. */ if (sregs->efer & EFER_LMA || sregs->cs.l) - return -EINVAL; + return false; } - return kvm_valid_cr4(vcpu, sregs->cr4); + return kvm_is_valid_cr4(vcpu, sregs->cr4); } -static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, + int *mmu_reset_needed, bool update_pdptrs) { struct msr_data apic_base_msr; - int mmu_reset_needed = 0; - int cpuid_update_needed = 0; - int pending_vec, max_bits, idx; + int idx; struct desc_ptr dt; - int ret = -EINVAL; - if (kvm_valid_sregs(vcpu, sregs)) - goto out; + if (!kvm_is_valid_sregs(vcpu, sregs)) + return -EINVAL; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) - goto out; + return -EINVAL; + + if (vcpu->arch.guest_state_protected) + return 0; dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); + static_call(kvm_x86_set_idt)(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); + static_call(kvm_x86_set_gdt)(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; - mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; + *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); + static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); kvm_set_cr8(vcpu, sregs->cr8); - mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); + *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; + static_call(kvm_x86_set_efer)(vcpu, sregs->efer); - mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; + static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; - cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & - (X86_CR4_OSXSAVE | X86_CR4_PKE)); - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (cpuid_update_needed) - kvm_update_cpuid(vcpu); + *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (is_pae_paging(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); - mmu_reset_needed = 1; - } - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (mmu_reset_needed) - kvm_mmu_reset_context(vcpu); - - max_bits = KVM_NR_INTERRUPTS; - pending_vec = find_first_bit( - (const unsigned long *)sregs->interrupt_bitmap, max_bits); - if (pending_vec < max_bits) { - kvm_queue_interrupt(vcpu, pending_vec, false); - pr_debug("Set back pending irq %d\n", pending_vec); + if (update_pdptrs) { + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (is_pae_paging(vcpu)) { + load_pdptrs(vcpu, kvm_read_cr3(vcpu)); + *mmu_reset_needed = 1; + } + srcu_read_unlock(&vcpu->kvm->srcu, idx); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -9065,11 +11544,63 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; +} - ret = 0; -out: - return ret; +static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + int pending_vec, max_bits; + int mmu_reset_needed = 0; + int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true); + + if (ret) + return ret; + + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + + max_bits = KVM_NR_INTERRUPTS; + pending_vec = find_first_bit( + (const unsigned long *)sregs->interrupt_bitmap, max_bits); + + if (pending_vec < max_bits) { + kvm_queue_interrupt(vcpu, pending_vec, false); + pr_debug("Set back pending irq %d\n", pending_vec); + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + return 0; +} + +static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) +{ + int mmu_reset_needed = 0; + bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; + bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && + !(sregs2->efer & EFER_LMA); + int i, ret; + + if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) + return -EINVAL; + + if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) + return -EINVAL; + + ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2, + &mmu_reset_needed, !valid_pdptrs); + if (ret) + return ret; + + if (valid_pdptrs) { + for (i = 0; i < 4 ; i++) + kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]); + + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + mmu_reset_needed = 1; + vcpu->arch.pdptrs_from_userspace = true; + } + if (mmu_reset_needed) + kvm_mmu_reset_context(vcpu); + return 0; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, @@ -9083,17 +11614,41 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool set = false; + struct kvm_vcpu *vcpu; + unsigned long i; + + if (!enable_apicv) + return; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + set = true; + break; + } + } + __kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { unsigned long rflags; int i, r; + if (vcpu->arch.guest_state_protected) + return -EINVAL; + vcpu_load(vcpu); if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) goto out; if (dbg->control & KVM_GUESTDBG_INJECT_DB) kvm_queue_exception(vcpu, DB_VECTOR); @@ -9122,8 +11677,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, kvm_update_dr7(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + - get_segment_base(vcpu, VCPU_SREG_CS); + vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); /* * Trigger an rflags update that will inject or remove the trace @@ -9131,7 +11685,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_bp_intercept(vcpu); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); r = 0; @@ -9156,7 +11712,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; - tr->valid = gpa != UNMAPPED_GVA; + tr->valid = gpa != INVALID_GPA; tr->writeable = 1; tr->usermode = 0; @@ -9168,9 +11724,12 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; + vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -9188,9 +11747,12 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { struct fxregs_state *fxsave; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) + return 0; + vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu->state.fxsave; + fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -9222,9 +11784,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -9244,28 +11803,19 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } -static void fx_init(struct kvm_vcpu *vcpu) -{ - fpstate_init(&vcpu->arch.guest_fpu->state); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = - host_xcr0 | XSTATE_COMPACTION_ENABLED; - - /* - * Ensure guest xcr0 is valid for loading - */ - vcpu->arch.xcr0 = XFEATURE_MASK_FP; - - vcpu->arch.cr0 |= X86_CR0_ET; -} - int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return 0; + if (!kvm->arch.max_vcpu_ids) + kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; + + if (id >= kvm->arch.max_vcpu_ids) + return -EINVAL; + + return static_call(kvm_x86_vcpu_precreate)(kvm); } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -9273,14 +11823,17 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) struct page *page; int r; - vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.last_vmentry_cpu = -1; + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + + kvm_gpc_init(&vcpu->arch.pv_time); + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - kvm_set_tsc_khz(vcpu, max_tsc_khz); - r = kvm_mmu_create(vcpu); if (r < 0) return r; @@ -9289,46 +11842,53 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; - if (kvm_apicv_activated(vcpu->kvm)) - vcpu->arch.apicv_active = true; + + /* + * Defer evaluating inhibits until the vCPU is first run, as + * this vCPU will not get notified of any changes until this + * vCPU is visible to other vCPUs (marked online and added to + * the set of vCPUs). Opportunistically mark APICv active as + * VMX in particularly is highly unlikely to have inhibits. + * Ignore the current per-VM APICv state so that vCPU creation + * is guaranteed to run with a deterministic value, the request + * will ensure the vCPU gets the correct state before VM-Entry. + */ + if (enable_apicv) { + vcpu->arch.apic->apicv_active = true; + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + } } else - static_key_slow_inc(&kvm_no_apic_vcpu); + static_branch_inc(&kvm_has_noapic_vcpu); r = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); + page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page) goto fail_free_lapic; vcpu->arch.pio_data = page_address(page); - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64), GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.mce_banks) - goto fail_free_pio_data; + vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) + goto fail_free_mce_banks; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL_ACCOUNT)) goto fail_free_mce_banks; - vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.user_fpu) { - pr_err("kvm: failed to allocate userspace's fpu\n"); + if (!alloc_emulate_ctxt(vcpu)) goto free_wbinvd_dirty_mask; - } - vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, - GFP_KERNEL_ACCOUNT); - if (!vcpu->arch.guest_fpu) { + if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) { pr_err("kvm: failed to allocate vcpu's fpu\n"); - goto free_user_fpu; + goto free_emulate_ctxt; } - fx_init(vcpu); - - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -9338,30 +11898,34 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.pending_external_vector = -1; vcpu->arch.preempted_in_kernel = false; - kvm_hv_vcpu_init(vcpu); +#if IS_ENABLED(CONFIG_HYPERV) + vcpu->arch.hv_root_tdp = INVALID_PAGE; +#endif - r = kvm_x86_ops->vcpu_create(vcpu); + r = static_call(kvm_x86_vcpu_create)(vcpu); if (r) goto free_guest_fpu; vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + kvm_xen_init_vcpu(vcpu); kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz); kvm_vcpu_reset(vcpu, false); - kvm_init_mmu(vcpu, false); + kvm_init_mmu(vcpu); vcpu_put(vcpu); return 0; free_guest_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); -free_user_fpu: - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); +free_emulate_ctxt: + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_wbinvd_dirty_mask: free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); -fail_free_pio_data: + kfree(vcpu->arch.mci_ctl2_banks); free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); @@ -9372,18 +11936,12 @@ fail_mmu_destroy: void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; - kvm_hv_vcpu_postcreate(vcpu); - if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); /* poll control enabled by default */ @@ -9391,42 +11949,54 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) mutex_unlock(&vcpu->mutex); - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; int idx; - kvm_release_pfn(cache->pfn, cache->dirty, cache); - kvmclock_reset(vcpu); - kvm_x86_ops->vcpu_free(vcpu); + static_call(kvm_x86_vcpu_free)(vcpu); + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); - kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + fpu_free_guest_fpstate(&vcpu->arch.guest_fpu); + kvm_xen_destroy_vcpu(vcpu); kvm_hv_vcpu_uninit(vcpu); kvm_pmu_destroy(vcpu); kfree(vcpu->arch.mce_banks); + kfree(vcpu->arch.mci_ctl2_banks); kvm_free_lapic(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); + static_branch_dec(&kvm_has_noapic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + struct kvm_cpuid_entry2 *cpuid_0x1; + unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + + /* + * Several of the "set" flows, e.g. ->set_cr0(), read other registers + * to handle side effects. RESET emulation hits those flows and relies + * on emulated/virtualized registers, including those that are loaded + * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel + * to detect improper or missing initialization. + */ + WARN_ON_ONCE(!init_event && + (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); + kvm_lapic_reset(vcpu, init_event); vcpu->arch.hflags = 0; @@ -9441,15 +12011,15 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); - vcpu->arch.dr6 = DR6_INIT; - kvm_update_dr6(vcpu); + vcpu->arch.dr6 = DR6_ACTIVE_LOW; vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); vcpu->arch.cr2 = 0; kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; + vcpu->arch.apf.msr_en_val = 0; + vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); @@ -9458,23 +12028,19 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - if (kvm_mpx_supported()) { - void *mpx_state_buffer; + if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { + struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; /* - * To avoid have the INIT path from kvm_apic_has_events() that be - * called with loaded FPU and does not let userspace fix the state. + * All paths that lead to INIT are required to load the guest's + * FPU state (because most paths are buried in KVM_RUN). */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDREGS); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_BNDCSR); - if (mpx_state_buffer) - memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); + + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS); + fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR); + if (init_event) kvm_load_guest_fpu(vcpu); } @@ -9484,18 +12050,77 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.smbase = 0x30000; vcpu->arch.msr_misc_features_enables = 0; + vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | + MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; - vcpu->arch.xcr0 = XFEATURE_MASK_FP; + __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP); + __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true); } + /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP); - vcpu->arch.ia32_xss = 0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry + * on RESET. But, go through the motions in case that's ever remedied. + */ + cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1); + kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600); + + static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + vcpu->arch.cr3 = 0; + kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3); - kvm_x86_ops->vcpu_reset(vcpu, init_event); + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + + /* + * On the standard CR0/CR4/EFER modification paths, there are several + * complex conditions determining whether the MMU has to be reset and/or + * which PCIDs have to be flushed. However, CR0.WP and the paging-related + * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush + * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as + * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here. + */ + if (old_cr0 & X86_CR0_PG) { + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + kvm_mmu_reset_context(vcpu); + } + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -9507,19 +12132,20 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); kvm_rip_write(vcpu, 0); } +EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); int kvm_arch_hardware_enable(void) { struct kvm *kvm; struct kvm_vcpu *vcpu; - int i; + unsigned long i; int ret; u64 local_tsc; u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(); + kvm_user_return_msr_cpu_online(); + ret = static_call(kvm_x86_hardware_enable)(); if (ret != 0) return ret; @@ -9601,21 +12227,56 @@ int kvm_arch_hardware_enable(void) void kvm_arch_hardware_disable(void) { - kvm_x86_ops->hardware_disable(); + static_call(kvm_x86_hardware_disable)(); drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void) +static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) +{ + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + +#define __KVM_X86_OP(func) \ + static_call_update(kvm_x86_##func, kvm_x86_ops.func); +#define KVM_X86_OP(func) \ + WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) +#define KVM_X86_OP_OPTIONAL __KVM_X86_OP +#define KVM_X86_OP_OPTIONAL_RET0(func) \ + static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ + (void *)__static_call_return0); +#include <asm/kvm-x86-ops.h> +#undef __KVM_X86_OP + + kvm_pmu_ops_update(ops->pmu_ops); +} + +int kvm_arch_hardware_setup(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - r = kvm_x86_ops->hardware_setup(); + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + kvm_init_pmu_capability(); + + r = ops->hardware_setup(); if (r != 0) return r; - cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + kvm_ops_update(ops); + + kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + kvm_caps.supported_xss = 0; - if (kvm_has_tsc_control) { +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has + + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that * fit into a signed integer. @@ -9623,34 +12284,33 @@ int kvm_arch_hardware_setup(void) * be 1 on all machines. */ u64 max = min(0x7fffffffULL, - __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); - kvm_max_guest_tsc_khz = max; - - kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); + kvm_caps.max_guest_tsc_khz = max; } - - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - + kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; kvm_init_msr_list(); return 0; } void kvm_arch_hardware_unsetup(void) { - kvm_x86_ops->hardware_unsetup(); + kvm_unregister_perf_callbacks(); + + static_call(kvm_x86_hardware_unsetup)(); } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct kvm_x86_init_ops *ops = opaque; WARN_ON(!irqs_disabled()); - if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits) + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) return -EIO; - return kvm_x86_ops->check_processor_compatibility(); + return ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -9664,8 +12324,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; } -struct static_key kvm_no_apic_vcpu __read_mostly; -EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); +__read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu); +EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu); void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { @@ -9676,18 +12336,37 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) pmu->need_cleanup = true; kvm_make_request(KVM_REQ_PMU, vcpu); } - kvm_x86_ops->sched_in(vcpu, cpu); + static_call(kvm_x86_sched_in)(vcpu, cpu); } +void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(to_kvm_hv(kvm)->hv_pa_pg); + __kvm_arch_free_vm(kvm); +} + + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + int ret; + unsigned long flags; + if (type) return -EINVAL; + ret = kvm_page_track_init(kvm); + if (ret) + goto out; + + ret = kvm_mmu_init_vm(kvm); + if (ret) + goto out_page_track; + + ret = static_call(kvm_x86_vm_init)(kvm); + if (ret) + goto out_uninit_mmu; + INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); - INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); - INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9699,21 +12378,37 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); - spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - + seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); + + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); pvclock_update_vm_gtod_copy(kvm); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; kvm->arch.guest_can_read_msr_platform_info = true; + kvm->arch.enable_pmu = enable_pmu; + +#if IS_ENABLED(CONFIG_HYPERV) + spin_lock_init(&kvm->arch.hv_root_tdp_lock); + kvm->arch.hv_root_tdp = INVALID_PAGE; +#endif INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); + kvm_apicv_init(kvm); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); - kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); + + return 0; - return kvm_x86_ops->vm_init(kvm); +out_uninit_mmu: + kvm_mmu_uninit_vm(kvm); +out_page_track: + kvm_page_track_cleanup(kvm); +out: + return ret; } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -9728,27 +12423,15 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } -static void kvm_free_vcpus(struct kvm *kvm) +static void kvm_unload_vcpu_mmus(struct kvm *kvm) { - unsigned int i; + unsigned long i; struct kvm_vcpu *vcpu; - /* - * Unpin any mmu pages first. - */ kvm_for_each_vcpu(i, vcpu, kvm) { kvm_clear_async_pf_completion_queue(vcpu); kvm_unload_vcpu_mmu(vcpu); } - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_vcpu_destroy(vcpu); - - mutex_lock(&kvm->lock); - for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) - kvm->vcpus[i] = NULL; - - atomic_set(&kvm->online_vcpus, 0); - mutex_unlock(&kvm->lock); } void kvm_arch_sync_events(struct kvm *kvm) @@ -9758,21 +12441,44 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) +/** + * __x86_set_memory_region: Setup KVM internal memory slot + * + * @kvm: the kvm pointer to the VM. + * @id: the slot ID to setup. + * @gpa: the GPA to install the slot (unused when @size == 0). + * @size: the size of the slot. Set to zero to uninstall a slot. + * + * This function helps to setup a KVM internal memory slot. Specify + * @size > 0 to install a new slot, while @size == 0 to uninstall a + * slot. The return code can be one of the following: + * + * HVA: on success (uninstall will return a bogus HVA) + * -errno: on error + * + * The caller should always use IS_ERR() to check the return value + * before use. Note, the KVM internal memory slots are guaranteed to + * remain valid and unchanged until the VM is destroyed, i.e., the + * GPA->HVA translation will not change. However, the HVA is a user + * address, i.e. its accessibility is not guaranteed, and must be + * accessed via __copy_{to,from}_user(). + */ +void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, + u32 size) { int i, r; - unsigned long hva; + unsigned long hva, old_npages; struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *slot, old; + struct kvm_memory_slot *slot; /* Called with kvm->slots_lock held. */ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) - return -EINVAL; + return ERR_PTR_USR(-EINVAL); slot = id_to_memslot(slots, id); if (size) { - if (slot->npages) - return -EEXIST; + if (slot && slot->npages) + return ERR_PTR_USR(-EEXIST); /* * MAP_SHARED to prevent internal slot pages from being moved @@ -9781,15 +12487,15 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); if (IS_ERR((void *)hva)) - return PTR_ERR((void *)hva); + return (void __user *)hva; } else { - if (!slot->npages) - return 0; + if (!slot || !slot->npages) + return NULL; - hva = 0; + old_npages = slot->npages; + hva = slot->userspace_addr; } - old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; @@ -9800,13 +12506,13 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) m.memory_size = size; r = __kvm_set_memory_region(kvm, &m); if (r < 0) - return r; + return ERR_PTR_USR(r); } if (!size) - vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + vm_munmap(hva, old_npages * PAGE_SIZE); - return 0; + return (void __user *)hva; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); @@ -9820,7 +12526,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) if (current->mm == kvm->mm) { /* * Free memory regions allocated on behalf of userspace, - * unless the the memory map has changed due to process exit + * unless the memory map has changed due to process exit * or fd copying. */ mutex_lock(&kvm->slots_lock); @@ -9831,64 +12537,94 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } - if (kvm_x86_ops->vm_destroy) - kvm_x86_ops->vm_destroy(kvm); + kvm_unload_vcpu_mmus(kvm); + static_call_cond(kvm_x86_vm_destroy)(kvm); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_free_vcpus(kvm); + kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); + kvm_xen_destroy_vm(kvm); kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void memslot_rmap_free(struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } - if (i == 0) - continue; + kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + } +} - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + int i; + + memslot_rmap_free(slot); + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { + kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } - kvm_page_track_free_memslot(free, dont); + kvm_page_track_free_memslot(slot); } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) { + const int sz = sizeof(*slot->arch.rmap[0]); int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + int level = i + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); + + if (slot->arch.rmap[i]) + continue; + + slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT); + if (!slot->arch.rmap[i]) { + memslot_rmap_free(slot); + return -ENOMEM; + } + } + + return 0; +} + +static int kvm_alloc_memslot_metadata(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + unsigned long npages = slot->npages; + int i, r; + + /* + * Clear out the previous array pointers for the KVM_MR_MOVE case. The + * old arrays will be freed by __kvm_set_memory_region() if installing + * the new memslot is successful. + */ + memset(&slot->arch, 0, sizeof(slot->arch)); + + if (kvm_memslots_have_rmaps(kvm)) { + r = memslot_rmap_alloc(slot, npages); + if (r) + return r; + } + + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { struct kvm_lpage_info *linfo; unsigned long ugfn; int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); - slot->arch.rmap[i] = - kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL_ACCOUNT); - if (!slot->arch.rmap[i]) - goto out_free; - if (i == 0) - continue; - - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); + linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -9901,11 +12637,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot + * other, disable large page support for this slot. */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { unsigned long j; for (j = 0; j < lpages; ++j) @@ -9913,18 +12647,15 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, } } - if (kvm_page_track_create_memslot(slot, npages)) + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); - slot->arch.rmap[i] = NULL; - if (i == 0) - continue; + memslot_rmap_free(slot); + for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { kvfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -9934,7 +12665,7 @@ out_free: void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) { struct kvm_vcpu *vcpu; - int i; + unsigned long i; /* * memslots->generation has been incremented. @@ -9948,107 +12679,180 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem, - enum kvm_mr_change change) + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) { + if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn()) + return -EINVAL; + + return kvm_alloc_memslot_metadata(kvm, new); + } + + if (change == KVM_MR_FLAGS_ONLY) + memcpy(&new->arch, &old->arch, sizeof(old->arch)); + else if (WARN_ON_ONCE(change != KVM_MR_DELETE)) + return -EIO; + return 0; } -static void kvm_mmu_slot_apply_flags(struct kvm *kvm, - struct kvm_memory_slot *new) + +static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) { - /* Still write protect RO slot */ - if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); + struct kvm_arch *ka = &kvm->arch; + + if (!kvm_x86_ops.cpu_dirty_log_size) return; - } + + if ((enable && ++ka->cpu_dirty_logging_count == 1) || + (!enable && --ka->cpu_dirty_logging_count == 0)) + kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING); + + WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0); +} + +static void kvm_mmu_slot_apply_flags(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + u32 old_flags = old ? old->flags : 0; + u32 new_flags = new ? new->flags : 0; + bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES; /* - * Call kvm_x86_ops dirty logging hooks when they are valid. - * - * kvm_x86_ops->slot_disable_log_dirty is called when: - * - * - KVM_MR_CREATE with dirty logging is disabled - * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag - * - * The reason is, in case of PML, we need to set D-bit for any slots - * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXIT). This - * guarantees leaving PML enabled during guest's lifetime won't have - * any additional overhead from PML when guest is running with dirty - * logging disabled for memory slots. - * - * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot - * to dirty logging mode. - * - * If kvm_x86_ops dirty logging hooks are invalid, use write protect. - * - * In case of write protect: - * - * Write protect all pages for dirty logging. + * Update CPU dirty logging if dirty logging is being toggled. This + * applies to all operations. + */ + if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) + kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages); + + /* + * Nothing more to do for RO slots (which can't be dirtied and can't be + * made writable) or CREATE/MOVE/DELETE of a slot. * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. + * For a memslot with dirty logging disabled: + * CREATE: No dirty mappings will already exist. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() * - * See the comments in fast_page_fault(). + * For a memslot with dirty logging enabled: + * CREATE: No shadow pages exist, thus nothing to write-protect + * and no dirty bits to clear. + * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot(). */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) - kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); + if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY)) + return; + + /* + * READONLY and non-flags changes were filtered out above, and the only + * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty + * logging isn't being toggled on or off. + */ + if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))) + return; + + if (!log_dirty_pages) { + /* + * Dirty logging tracks sptes in 4k granularity, meaning that + * large sptes have to be split. If live migration succeeds, + * the guest in the source machine will be destroyed and large + * sptes will be created in the destination. However, if the + * guest continues to run in the source machine (for example if + * live migration fails), small sptes will remain around and + * cause bad performance. + * + * Scan sptes if dirty logging has been stopped, dropping those + * which can be collapsed into a single large-page spte. Later + * page faults will create the large-page sptes. + */ + kvm_mmu_zap_collapsible_sptes(kvm, new); } else { - if (kvm_x86_ops->slot_disable_log_dirty) - kvm_x86_ops->slot_disable_log_dirty(kvm, new); + /* + * Initially-all-set does not require write protecting any page, + * because they're all assumed to be dirty. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + + if (READ_ONCE(eager_page_split)) + kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K); + + if (kvm_x86_ops.cpu_dirty_log_size) { + kvm_mmu_slot_leaf_clear_dirty(kvm, new); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M); + } else { + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); + } + + /* + * Unconditionally flush the TLBs after enabling dirty logging. + * A flush is almost always going to be necessary (see below), + * and unconditionally flushing allows the helpers to omit + * the subtly complex checks when removing write access. + * + * Do the flush outside of mmu_lock to reduce the amount of + * time mmu_lock is held. Flushing after dropping mmu_lock is + * safe as KVM only needs to guarantee the slot is fully + * write-protected before returning to userspace, i.e. before + * userspace can consume the dirty status. + * + * Flushing outside of mmu_lock requires KVM to be careful when + * making decisions based on writable status of an SPTE, e.g. a + * !writable SPTE doesn't guarantee a CPU can't perform writes. + * + * Specifically, KVM also write-protects guest page tables to + * monitor changes when using shadow paging, and must guarantee + * no CPUs can write to those page before mmu_lock is dropped. + * Because CPUs may have stale TLB entries at this point, a + * !writable SPTE doesn't guarantee CPUs can't perform writes. + * + * KVM also allows making SPTES writable outside of mmu_lock, + * e.g. to allow dirty logging without taking mmu_lock. + * + * To handle these scenarios, KVM uses a separate software-only + * bit (MMU-writable) to track if a SPTE is !writable due to + * a guest page table being write-protected (KVM clears the + * MMU-writable flag when write-protecting for shadow paging). + * + * The use of MMU-writable is also the primary motivation for + * the unconditional flush. Because KVM must guarantee that a + * CPU doesn't contain stale, writable TLB entries for a + * !MMU-writable SPTE, KVM must flush if it encounters any + * MMU-writable SPTE regardless of whether the actual hardware + * writable bit was set. I.e. KVM is almost guaranteed to need + * to flush, while unconditionally flushing allows the "remove + * write access" helpers to ignore MMU-writable entirely. + * + * See is_writable_pte() for more details (the case involving + * access-tracked SPTEs is particularly relevant). + */ + kvm_arch_flush_remote_tlbs_memslot(kvm, new); } } void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { - if (!kvm->arch.n_requested_mmu_pages) - kvm_mmu_change_mmu_pages(kvm, - kvm_mmu_calculate_default_mmu_pages(kvm)); + if (!kvm->arch.n_requested_mmu_pages && + (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) { + unsigned long nr_mmu_pages; - /* - * Dirty logging tracks sptes in 4k granularity, meaning that large - * sptes have to be split. If live migration is successful, the guest - * in the source machine will be destroyed and large sptes will be - * created in the destination. However, if the guest continues to run - * in the source machine (for example if live migration fails), small - * sptes will remain around and cause bad performance. - * - * Scan sptes if dirty logging has been stopped, dropping those - * which can be collapsed into a single large-page spte. Later - * page faults will create the large-page sptes. - * - * There is no need to do this in any of the following cases: - * CREATE: No dirty mappings will already exist. - * MOVE/DELETE: The old mappings will already have been cleaned up by - * kvm_arch_flush_shadow_memslot() - */ - if (change == KVM_MR_FLAGS_ONLY && - (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) - kvm_mmu_zap_collapsible_sptes(kvm, new); + nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO; + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + } - /* - * Set up write protection and/or dirty logging for the new slot. - * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have - * been zapped so no dirty logging staff is needed for old slot. For - * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the - * new and it's also covered when dealing with the new slot. - * - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + kvm_mmu_slot_apply_flags(kvm, old, new, change); + + /* Free the arrays associated with the old memslot. */ + if (change == KVM_MR_MOVE) + kvm_arch_free_memslot(kvm, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) @@ -10065,8 +12869,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops->guest_apic_has_interrupt && - kvm_x86_ops->guest_apic_has_interrupt(vcpu)); + static_call(kvm_x86_guest_apic_has_interrupt)(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -10074,22 +12877,24 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (!list_empty_careful(&vcpu->async_pf.done)) return true; - if (kvm_apic_has_events(vcpu)) + if (kvm_apic_has_pending_init_or_sipi(vcpu) && + kvm_apic_init_sipi_allowed(vcpu)) return true; if (vcpu->arch.pv.pv_unhalted) return true; - if (vcpu->arch.exception.pending) + if (kvm_is_exception_pending(vcpu)) return true; if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops->nmi_allowed(vcpu))) + static_call(kvm_x86_nmi_allowed)(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && !is_smm(vcpu))) + (vcpu->arch.smi_pending && + static_call(kvm_x86_smi_allowed)(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10100,6 +12905,14 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_hv_has_stimer_pending(vcpu)) return true; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->has_events && + kvm_x86_ops.nested_ops->has_events(vcpu)) + return true; + + if (kvm_xen_has_pending_events(vcpu)) + return true; + return false; } @@ -10108,6 +12921,15 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } +bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu) +{ + if (kvm_vcpu_apicv_active(vcpu) && + static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu)) + return true; + + return false; +} + bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) { if (READ_ONCE(vcpu->arch.pv.pv_unhalted)) @@ -10118,17 +12940,22 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) - return true; - - return false; + return kvm_arch_dy_has_pending_interrupt(vcpu); } bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { + if (vcpu->arch.guest_state_protected) + return true; + return vcpu->arch.preempted_in_kernel; } +unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu) +{ + return kvm_rip_read(vcpu); +} + int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) { return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; @@ -10136,11 +12963,15 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->interrupt_allowed(vcpu); + return static_call(kvm_x86_interrupt_allowed)(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) { + /* Can't read the RIP when guest state is protected, just return 0 */ + if (vcpu->arch.guest_state_protected) + return 0; + if (is_64_bit_mode(vcpu)) return kvm_rip_read(vcpu); return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) + @@ -10158,7 +12989,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; - rflags = kvm_x86_ops->get_rflags(vcpu); + rflags = static_call(kvm_x86_get_rflags)(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -10170,7 +13001,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - kvm_x86_ops->set_rflags(vcpu, rflags); + static_call(kvm_x86_set_rflags)(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -10180,33 +13011,16 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) } EXPORT_SYMBOL_GPL(kvm_set_rflags); -void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) -{ - int r; - - if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || - work->wakeup_all) - return; - - r = kvm_mmu_reload(vcpu); - if (unlikely(r)) - return; - - if (!vcpu->arch.mmu->direct_map && - work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu)) - return; - - kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); -} - static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); } static inline u32 kvm_async_pf_next_probe(u32 key) { - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); } static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -10224,7 +13038,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) int i; u32 key = kvm_async_pf_hash_fn(gfn); - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + for (i = 0; i < ASYNC_PF_PER_VCPU && (vcpu->arch.apf.gfns[key] != gfn && vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); @@ -10242,6 +13056,10 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) u32 i, j, k; i = j = kvm_async_pf_gfn_slot(vcpu, gfn); + + if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn)) + return; + while (true) { vcpu->arch.apf.gfns[i] = ~0; do { @@ -10260,38 +13078,65 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) } } -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) { + u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, + sizeof(reason)); } -static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) { + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + + return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &token, offset, sizeof(token)); +} + +static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) +{ + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + u32 val; - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, - sizeof(u32)); + if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &val, offset, sizeof(val))) + return false; + + return !val; } static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) { - if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + + if (!kvm_pv_async_pf_enabled(vcpu)) return false; - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) + if (vcpu->arch.apf.send_user_only && + static_call(kvm_x86_get_cpl)(vcpu) == 0) return false; - return true; + if (is_guest_mode(vcpu)) { + /* + * L1 needs to opt into the special #PF vmexits that are + * used to deliver async page faults. + */ + return vcpu->arch.apf.delivery_as_pf_vmexit; + } else { + /* + * Play it safe in case the guest temporarily disables paging. + * The real mode IDT in particular is unlikely to have a #PF + * exception setup. + */ + return is_paging(vcpu); + } } bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) { if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu) || - vcpu->arch.exception.pending)) + kvm_is_exception_pending(vcpu))) return false; if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) @@ -10301,10 +13146,10 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_arch_interrupt_allowed(vcpu); } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; @@ -10313,7 +13158,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + !apf_put_user_notpresent(vcpu)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -10321,6 +13166,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + return true; } else { /* * It is not possible to deliver a paravirtualized asynchronous @@ -10331,14 +13177,17 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, * fault is retried, hopefully the page will be ready in the host. */ kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return false; } } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - struct x86_exception fault; - u32 val; + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vcpu->arch.apf.vec + }; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -10346,44 +13195,36 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_get_user(vcpu, &val)) { - if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && - vcpu->arch.exception.pending && - vcpu->arch.exception.nr == PF_VECTOR && - !apf_put_user(vcpu, 0)) { - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = false; - vcpu->arch.exception.nr = 0; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.error_code = 0; - vcpu->arch.exception.has_payload = false; - vcpu->arch.exception.payload = 0; - } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); - } + if ((work->wakeup_all || work->notpresent_injected) && + kvm_pv_async_pf_enabled(vcpu) && + !apf_put_user_ready(vcpu, work->arch.token)) { + vcpu->arch.apf.pageready_pending = true; + kvm_apic_set_irq(vcpu, &irq, NULL); } + vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_APF_READY, vcpu); + if (!vcpu->arch.apf.pageready_pending) + kvm_vcpu_kick(vcpu); +} + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return kvm_can_do_async_pf(vcpu); + return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) { - atomic_inc(&kvm->arch.assigned_device_count); + if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1) + static_call_cond(kvm_x86_pi_start_assignment)(kvm); } EXPORT_SYMBOL_GPL(kvm_arch_start_assignment); @@ -10393,9 +13234,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); @@ -10427,11 +13268,17 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + int ret; irqfd->producer = prod; + kvm_arch_start_assignment(irqfd->kvm); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + + if (ret) + kvm_arch_end_assignment(irqfd->kvm); - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); + return ret; } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -10450,16 +13297,27 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ - ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + + kvm_arch_end_assignment(irqfd->kvm); } int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); + return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set); +} + +bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, + struct kvm_kernel_irq_routing_entry *new) +{ + if (new->type != KVM_IRQ_ROUTING_MSI) + return true; + + return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); } bool kvm_vector_hashing_enabled(void) @@ -10473,36 +13331,363 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) + +int kvm_spec_ctrl_test_value(u64 value) { - uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + /* + * test that setting IA32_SPEC_CTRL to given value + * is allowed by the host processor + */ + + u64 saved_value; + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + + if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); - /* The STIBP bit doesn't fault even if it's not advertised */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && - !boot_cpu_has(X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) +{ + struct kvm_mmu *mmu = vcpu->arch.walk_mmu; + struct x86_exception fault; + u64 access = error_code & + (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); - return bits; + if (!(error_code & PFERR_PRESENT_MASK) || + mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) { + /* + * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * tables probably do not match the TLB. Just proceed + * with the error code that the processor gave. + */ + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = error_code; + fault.nested_page_fault = false; + fault.address = gva; + fault.async_page_fault = false; + } + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); + +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. + */ + kvm_prepare_emulation_failure_exit(vcpu); + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + kvm_invalidate_pcid(vcpu, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + kvm_inject_gp(vcpu, 0); + return 1; + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); + +static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + struct kvm_mmio_fragment *frag; + unsigned int len; + + BUG_ON(!vcpu->mmio_needed); + + /* Complete previous fragment */ + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len = min(8u, frag->len); + if (!vcpu->mmio_is_write) + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + + if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { + vcpu->mmio_needed = 0; + + // VMG change, at this point, we're always done + // RIP has already been advanced + return 1; + } + + // More MMIO is needed + run->mmio.phys_addr = frag->gpa; + run->mmio.len = min(8u, frag->len); + run->mmio.is_write = vcpu->mmio_is_write; + if (run->mmio.is_write) + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} + +int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 1; + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write); + +int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes, + void *data) +{ + int handled; + struct kvm_mmio_fragment *frag; + + if (!data) + return -EINVAL; + + handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data); + if (handled == bytes) + return 1; + + bytes -= handled; + gpa += handled; + data += handled; + + /*TODO: Check if need to increment number of frags */ + frag = vcpu->mmio_fragments; + vcpu->mmio_nr_fragments = 1; + frag->len = bytes; + frag->gpa = gpa; + frag->data = data; + + vcpu->mmio_needed = 1; + vcpu->mmio_cur_fragment = 0; + + vcpu->run->mmio.phys_addr = gpa; + vcpu->run->mmio.len = min(8u, frag->len); + vcpu->run->mmio.is_write = 0; + vcpu->run->exit_reason = KVM_EXIT_MMIO; + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read); + +static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size) +{ + vcpu->arch.sev_pio_count -= count; + vcpu->arch.sev_pio_data += count * size; +} + +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu) +{ + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + vcpu->arch.pio.count = 0; + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_outs(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count); + + /* memcpy done already by emulator_pio_out. */ + advance_sev_es_emulated_pio(vcpu, count, size); + if (!ret) + break; + + /* Emulation done by the kernel. */ + if (!vcpu->arch.sev_pio_count) + return 1; + } + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs; + return 0; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port); + +static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu) +{ + unsigned count = vcpu->arch.pio.count; + int size = vcpu->arch.pio.size; + int port = vcpu->arch.pio.port; + + complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data); + advance_sev_es_emulated_pio(vcpu, count, size); + if (vcpu->arch.sev_pio_count) + return kvm_sev_es_ins(vcpu, size, port); + return 1; +} + +static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port) +{ + for (;;) { + unsigned int count = + min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count); + if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count)) + break; + + /* Emulation done by the kernel. */ + advance_sev_es_emulated_pio(vcpu, count, size); + if (!vcpu->arch.sev_pio_count) + return 1; + } + + vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins; + return 0; +} + +int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size, + unsigned int port, void *data, unsigned int count, + int in) +{ + vcpu->arch.sev_pio_data = data; + vcpu->arch.sev_pio_count = count; + return in ? kvm_sev_es_ins(vcpu, size, port) + : kvm_sev_es_outs(vcpu, size, port); +} +EXPORT_SYMBOL_GPL(kvm_sev_es_string_io); + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); @@ -10516,4 +13701,27 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit); + +static int __init kvm_x86_init(void) +{ + kvm_mmu_x86_module_init(); + return 0; +} +module_init(kvm_x86_init); + +static void __exit kvm_x86_exit(void) +{ + /* + * If module_init() is implemented, module_exit() must also be + * implemented to allow module unload. + */ +} +module_exit(kvm_x86_exit); |