From 0e4524a5d341e719e8ee9ee7db5d58e2c5a4c10e Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 6 Jul 2017 14:44:28 +0200 Subject: KVM: mark vcpu->pid pointer as rcu protected We do use rcu to protect the pid pointer. Mark it as such and adopt all code to use the proper access methods. This was detected by sparse. "virt/kvm/kvm_main.c:2248:15: error: incompatible types in comparison expression (different address spaces)" Signed-off-by: Christian Borntraeger Reviewed-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0b50e7b35ed4..bcd37b855c66 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -234,7 +234,7 @@ struct kvm_vcpu { int guest_fpu_loaded, guest_xcr0_loaded; struct swait_queue_head wq; - struct pid *pid; + struct pid __rcu *pid; int sigset_active; sigset_t sigset; struct kvm_vcpu_stat stat; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 19f0ecb9b93e..fc2d58312fd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -293,7 +293,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { - put_pid(vcpu->pid); + /* + * no need for rcu_read_lock as VCPU_RUN is the only place that + * will change the vcpu->pid pointer and on uninit all file + * descriptors are already gone. + */ + put_pid(rcu_dereference_protected(vcpu->pid, 1)); kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } @@ -2551,13 +2556,14 @@ static long kvm_vcpu_ioctl(struct file *filp, if (r) return r; switch (ioctl) { - case KVM_RUN: + case KVM_RUN: { + struct pid *oldpid; r = -EINVAL; if (arg) goto out; - if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { + oldpid = rcu_access_pointer(vcpu->pid); + if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ - struct pid *oldpid = vcpu->pid; struct pid *newpid = get_task_pid(current, PIDTYPE_PID); rcu_assign_pointer(vcpu->pid, newpid); @@ -2568,6 +2574,7 @@ static long kvm_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; + } case KVM_GET_REGS: { struct kvm_regs *kvm_regs; -- cgit v1.2.3-59-g8ed1b From 5535f800b0e1533e5f3a1428f6ef25eb29eccc0f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 6 Jul 2017 20:31:11 +0200 Subject: KVM: use rcu access function for irq routing irq routing is rcu protected. Use the proper access functions. Found by sparse virt/kvm/irqchip.c:233:13: warning: incorrect type in assignment (different address spaces) virt/kvm/irqchip.c:233:13: expected struct kvm_irq_routing_table *old virt/kvm/irqchip.c:233:13: got struct kvm_irq_routing_table [noderef] *irq_routing Signed-off-by: Christian Borntraeger Reviewed-by: Paolo Bonzini --- virt/kvm/irqchip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 31e40c9e81df..b1286c4e0712 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -230,7 +230,7 @@ int kvm_set_irq_routing(struct kvm *kvm, } mutex_lock(&kvm->irq_lock); - old = kvm->irq_routing; + old = rcu_dereference_protected(kvm->irq_routing, 1); rcu_assign_pointer(kvm->irq_routing, new); kvm_irq_routing_update(kvm); kvm_arch_irq_routing_update(kvm); -- cgit v1.2.3-59-g8ed1b From 4a12f95177280a660bda99e81838919b1cc6a91a Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 7 Jul 2017 10:51:38 +0200 Subject: KVM: mark kvm->busses as rcu protected mark kvm->busses as rcu protected and use the correct access function everywhere. found by sparse virt/kvm/kvm_main.c:3490:15: error: incompatible types in comparison expression (different address spaces) virt/kvm/kvm_main.c:3509:15: error: incompatible types in comparison expression (different address spaces) virt/kvm/kvm_main.c:3561:15: error: incompatible types in comparison expression (different address spaces) virt/kvm/kvm_main.c:3644:15: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Christian Borntraeger --- include/linux/kvm_host.h | 8 +++++++- virt/kvm/eventfd.c | 8 +++++--- virt/kvm/kvm_main.c | 17 ++++++++++------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index bcd37b855c66..6a164f9eb02c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -404,7 +404,7 @@ struct kvm { int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; - struct kvm_io_bus *buses[KVM_NR_BUSES]; + struct kvm_io_bus __rcu *buses[KVM_NR_BUSES]; #ifdef CONFIG_HAVE_KVM_EVENTFD struct { spinlock_t lock; @@ -473,6 +473,12 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx) +{ + return srcu_dereference_check(kvm->buses[idx], &kvm->srcu, + lockdep_is_held(&kvm->slots_lock)); +} + static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) { /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu, in case diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a8d540398bbd..d016aadd5fbb 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -825,7 +825,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, if (ret < 0) goto unlock_fail; - kvm->buses[bus_idx]->ioeventfd_count++; + kvm_get_bus(kvm, bus_idx)->ioeventfd_count++; list_add_tail(&p->list, &kvm->ioeventfds); mutex_unlock(&kvm->slots_lock); @@ -848,6 +848,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, { struct _ioeventfd *p, *tmp; struct eventfd_ctx *eventfd; + struct kvm_io_bus *bus; int ret = -ENOENT; eventfd = eventfd_ctx_fdget(args->fd); @@ -870,8 +871,9 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - if (kvm->buses[bus_idx]) - kvm->buses[bus_idx]->ioeventfd_count--; + bus = kvm_get_bus(kvm, bus_idx); + if (bus) + bus->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fc2d58312fd5..d76e822f8929 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -679,8 +679,8 @@ static struct kvm *kvm_create_vm(unsigned long type) if (init_srcu_struct(&kvm->irq_srcu)) goto out_err_no_irq_srcu; for (i = 0; i < KVM_NR_BUSES; i++) { - kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), - GFP_KERNEL); + rcu_assign_pointer(kvm->buses[i], + kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); if (!kvm->buses[i]) goto out_err; } @@ -705,7 +705,7 @@ out_err_no_srcu: hardware_disable_all(); out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) - kfree(kvm->buses[i]); + kfree(rcu_access_pointer(kvm->buses[i])); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, kvm->memslots[i]); kvm_arch_free_vm(kvm); @@ -740,8 +740,11 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - if (kvm->buses[i]) - kvm_io_bus_destroy(kvm->buses[i]); + struct kvm_io_bus *bus; + + bus = rcu_dereference_protected(kvm->buses[i], 1); + if (bus) + kvm_io_bus_destroy(bus); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); @@ -3570,7 +3573,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, { struct kvm_io_bus *new_bus, *bus; - bus = kvm->buses[bus_idx]; + bus = kvm_get_bus(kvm, bus_idx); if (!bus) return -ENOMEM; @@ -3599,7 +3602,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, int i; struct kvm_io_bus *new_bus, *bus; - bus = kvm->buses[bus_idx]; + bus = kvm_get_bus(kvm, bus_idx); if (!bus) return; -- cgit v1.2.3-59-g8ed1b From a80cf7b5f4149753d5f19c872a47e66195b167d4 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 6 Jul 2017 16:17:14 +0200 Subject: KVM: mark memory slots as rcu we access the memslots array via srcu. Mark it as such and use the right access functions also for the freeing of memory slots. Found by sparse: ./include/linux/kvm_host.h:565:16: error: incompatible types in comparison expression (different address spaces) Signed-off-by: Christian Borntraeger Reviewed-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6a164f9eb02c..b3ca77a96b2d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -390,7 +390,7 @@ struct kvm { spinlock_t mmu_lock; struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ - struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; + struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d76e822f8929..6e6d4edf0e92 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -707,7 +707,8 @@ out_err_no_disable: for (i = 0; i < KVM_NR_BUSES; i++) kfree(rcu_access_pointer(kvm->buses[i])); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, kvm->memslots[i]); + kvm_free_memslots(kvm, + rcu_dereference_protected(kvm->memslots[i], 1)); kvm_arch_free_vm(kvm); mmdrop(current->mm); return ERR_PTR(r); @@ -756,7 +757,8 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, kvm->memslots[i]); + kvm_free_memslots(kvm, + rcu_dereference_protected(kvm->memslots[i], 1)); cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); -- cgit v1.2.3-59-g8ed1b From 7e988b103d0d52190244517edc76e649071284bb Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 7 Jul 2017 15:49:00 +0200 Subject: KVM: use correct accessor function for __kvm_memslots kvm memslots are protected by srcu and not by rcu. We must use srcu_dereference_check instead of rcu_dereference_check. Signed-off-by: Christian Borntraeger Suggested-by: Paolo Bonzini --- include/linux/kvm_host.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b3ca77a96b2d..648b34cabb38 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -568,9 +568,8 @@ void kvm_put_kvm(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { - return rcu_dereference_check(kvm->memslots[as_id], - srcu_read_lock_held(&kvm->srcu) - || lockdep_is_held(&kvm->slots_lock)); + return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, + lockdep_is_held(&kvm->slots_lock)); } static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm) -- cgit v1.2.3-59-g8ed1b From b49defe83659cefbb1763d541e779da32594ab10 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 Jun 2017 13:25:45 +0200 Subject: kvm: avoid unused variable warning for UP builds The uniprocessor version of smp_call_function_many does not evaluate all of its argument, and the compiler emits a warning about "wait" being unused. This breaks the build on architectures for which "-Werror" is enabled by default. Work around it by moving the invocation of smp_call_function_many to its own inline function. Reported-by: Paul Mackerras Cc: stable@vger.kernel.org Fixes: 7a97cec26b94c909f4cbad2dc3186af3e457a522 Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 19f0ecb9b93e..0d796c9a6482 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -187,12 +187,23 @@ static void ack_flush(void *_completed) { } +static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) +{ + if (unlikely(!cpus)) + cpus = cpu_online_mask; + + if (cpumask_empty(cpus)) + return false; + + smp_call_function_many(cpus, ack_flush, NULL, wait); + return true; +} + bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) { int i, cpu, me; cpumask_var_t cpus; - bool called = true; - bool wait = req & KVM_REQUEST_WAIT; + bool called; struct kvm_vcpu *vcpu; zalloc_cpumask_var(&cpus, GFP_ATOMIC); @@ -207,14 +218,9 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) if (cpus != NULL && cpu != -1 && cpu != me && kvm_request_needs_ipi(vcpu, req)) - cpumask_set_cpu(cpu, cpus); + __cpumask_set_cpu(cpu, cpus); } - if (unlikely(cpus == NULL)) - smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); - else if (!cpumask_empty(cpus)) - smp_call_function_many(cpus, ack_flush, NULL, wait); - else - called = false; + called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); put_cpu(); free_cpumask_var(cpus); return called; -- cgit v1.2.3-59-g8ed1b From 70bcd708dfd1de453905212c9c5c755b1844772b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Jul 2017 12:38:06 +0200 Subject: KVM: vmx: expose more information for KVM_INTERNAL_ERROR_DELIVERY_EV exits This exit ended up being reported, but the currently exposed data does not provide much of a starting point for debugging. In the reported case, the vmexit was an EPT misconfiguration (MMIO access). Let userspace report ethe exit qualification and, if relevant, the GPA. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f76efad248ab..7592a18ecc1c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8418,9 +8418,15 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) exit_reason != EXIT_REASON_TASK_SWITCH)) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; - vcpu->run->internal.ndata = 2; + vcpu->run->internal.ndata = 3; vcpu->run->internal.data[0] = vectoring_info; vcpu->run->internal.data[1] = exit_reason; + vcpu->run->internal.data[2] = vcpu->arch.exit_qualification; + if (exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vcpu->run->internal.ndata++; + vcpu->run->internal.data[3] = + vmcs_read64(GUEST_PHYSICAL_ADDRESS); + } return 0; } -- cgit v1.2.3-59-g8ed1b From 7cdc2d62f9cce8a02eb2a5fbfca6813f04189487 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 6 Jul 2017 16:33:05 -0700 Subject: kvm: nVMX: Don't set vmcs12 to "launched" when VMLAUNCH fails The VMCS launch state is not set to "launched" unless the VMLAUNCH actually succeeds. VMLAUNCH failure includes VM-exits with bit 31 set. Note that this change does not address the general problem that a failure to launch/resume vmcs02 (i.e. vmx->fail) is not handled correctly. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7592a18ecc1c..ef978d5983ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10435,8 +10435,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) return 1; } - vmcs12->launch_state = 1; - /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet @@ -10810,6 +10808,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { + vmcs12->launch_state = 1; + /* vm_entry_intr_info_field is cleared on exit. Emulate this * instead of reading the real value. */ vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; -- cgit v1.2.3-59-g8ed1b From 56a205100d3933f785ca970c58aecedd94ff90b2 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 6 Jul 2017 16:33:06 -0700 Subject: kvm: nVMX: Validate the I/O bitmaps on nested VM-entry According to the SDM, if the "use I/O bitmaps" VM-execution control is 1, bits 11:0 of each I/O-bitmap address must be 0. Neither address should set any bits beyond the processor's physical-address width. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef978d5983ae..22034ac4b5f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9595,6 +9595,19 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); } +static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) +{ + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) + return 0; + + if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) || + !page_address_valid(vcpu, vmcs12->io_bitmap_b)) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -10299,6 +10312,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; -- cgit v1.2.3-59-g8ed1b From 5fa99cbe7b666dce6dd8ac55b253778893b9c5df Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 6 Jul 2017 16:33:07 -0700 Subject: kvm: nVMX: Fix nested_vmx_check_msr_bitmap_controls Allow the L1 guest to specify the last page of addressable guest physical memory for an L2 MSR permission bitmap. Also remove the vmcs12_read_any() check that should never fail. Fixes: 3af18d9c5fe95 ("KVM: nVMX: Prepare for using hardware MSR bitmap") Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 22034ac4b5f2..e02c7004b64b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4634,6 +4634,11 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) return true; } +static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn; @@ -9611,20 +9616,10 @@ static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - int maxphyaddr; - u64 addr; - if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 0; - if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { - WARN_ON(1); - return -EINVAL; - } - maxphyaddr = cpuid_maxphyaddr(vcpu); - - if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || - ((addr + PAGE_SIZE) >> maxphyaddr)) + if (!page_address_valid(vcpu, vmcs12->msr_bitmap)) return -EINVAL; return 0; -- cgit v1.2.3-59-g8ed1b From 85fd514e24238a633c971332aa96a2e5c4ddd502 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 7 Jul 2017 12:51:41 -0700 Subject: kvm: nVMX: Shadow "high" parts of shadowed 64-bit VMCS fields Inconsistencies result from shadowing only accesses to the full 64-bits of a 64-bit VMCS field, but not shadowing accesses to the high 32-bits of the field. The "high" part of a 64-bit field should be shadowed whenever the full 64-bit field is shadowed. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 60 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e02c7004b64b..32db3f5dce7f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3764,6 +3764,25 @@ static void free_kvm_area(void) } } +enum vmcs_field_type { + VMCS_FIELD_TYPE_U16 = 0, + VMCS_FIELD_TYPE_U64 = 1, + VMCS_FIELD_TYPE_U32 = 2, + VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +}; + +static inline int vmcs_field_type(unsigned long field) +{ + if (0x1 & field) /* the *_HIGH fields are all 32 bit */ + return VMCS_FIELD_TYPE_U32; + return (field >> 13) & 0x3 ; +} + +static inline int vmcs_field_readonly(unsigned long field) +{ + return (((field >> 10) & 0x3) == 1); +} + static void init_vmcs_shadow_fields(void) { int i, j; @@ -3789,14 +3808,22 @@ static void init_vmcs_shadow_fields(void) /* shadowed fields guest access without vmexit */ for (i = 0; i < max_shadow_read_write_fields; i++) { - clear_bit(shadow_read_write_fields[i], - vmx_vmwrite_bitmap); - clear_bit(shadow_read_write_fields[i], - vmx_vmread_bitmap); + unsigned long field = shadow_read_write_fields[i]; + + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { + clear_bit(field + 1, vmx_vmwrite_bitmap); + clear_bit(field + 1, vmx_vmread_bitmap); + } + } + for (i = 0; i < max_shadow_read_only_fields; i++) { + unsigned long field = shadow_read_only_fields[i]; + + clear_bit(field, vmx_vmread_bitmap); + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) + clear_bit(field + 1, vmx_vmread_bitmap); } - for (i = 0; i < max_shadow_read_only_fields; i++) - clear_bit(shadow_read_only_fields[i], - vmx_vmread_bitmap); } static __init int alloc_kvm_area(void) @@ -7219,25 +7246,6 @@ static int handle_vmresume(struct kvm_vcpu *vcpu) return nested_vmx_run(vcpu, false); } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 -}; - -static inline int vmcs_field_type(unsigned long field) -{ - if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; - return (field >> 13) & 0x3 ; -} - -static inline int vmcs_field_readonly(unsigned long field) -{ - return (((field >> 10) & 0x3) == 1); -} - /* * Read a vmcs12 field. Since these can have varying lengths and we return * one type, we chose the biggest type (u64) and zero-extend the return value -- cgit v1.2.3-59-g8ed1b From 0bc48bea36d178aea9d7f83f66a1b397cec9db5c Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Tue, 16 May 2017 22:50:00 +0200 Subject: KVM: x86: update master clock before computing kvmclock_offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm master clock usually has a different frequency than the kernel boot clock. This is not a problem until the master clock is updated; update uses the current kernel boot clock to compute new kvm clock, which erases any kvm clock cycles that might have built up due to frequency difference over a long period. KVM_SET_CLOCK is one of places where we can safely update master clock as the guest-visible clock is going to be shifted anyway. The problem with current code is that it updates the kvm master clock after updating the offset. If the master clock was enabled before calling KVM_SET_CLOCK, then it might have built up a significant delta from kernel boot clock. In the worst case, the time set by userspace would be shifted by so much that it couldn't have been set at any point during KVM_SET_CLOCK. To fix this, move kvm_gen_update_masterclock() before computing kvmclock_offset, which means that the master clock and kernel boot clock will be sufficiently close together. Another solution would be to replace get_kvmclock_ns() with "ktime_get_boot_ns() + ka->kvmclock_offset", which is marginally more accurate, but would break symmetry with KVM_GET_CLOCK. Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c7266f7766d..ca128a9c9cc4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4188,9 +4188,15 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; + /* + * TODO: userspace has to take care of races with VCPU_RUN, so + * kvm_gen_update_masterclock() can be cut down to locked + * pvclock_update_vm_gtod_copy(). + */ + kvm_gen_update_masterclock(kvm); now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - kvm_gen_update_masterclock(kvm); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } case KVM_GET_CLOCK: { -- cgit v1.2.3-59-g8ed1b From 48ae0fb49b2e8380e60c5072f18005c8a98c6520 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 22 May 2017 09:48:33 -0700 Subject: kvm: vmx: Properly handle machine check during VM-entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmx_complete_atomic_exit should call kvm_machine_check for any VM-entry failure due to a machine-check event. Such an exit should be recognized solely by its basic exit reason (i.e. the low 16 bits of the VMCS exit reason field). None of the other VMCS exit information fields contain valid information when the VM-exit is due to "VM-entry failure due to machine-check event". Signed-off-by: Jim Mattson Reviewed-by: Xiao Guangrong [Changed VM_EXIT_INTR_INFO condition to better describe its reason.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32db3f5dce7f..25f2fdccf625 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8630,17 +8630,20 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; + u32 exit_intr_info = 0; + u16 basic_exit_reason = (u16)vmx->exit_reason; - if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY - || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + if (!(basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || basic_exit_reason == EXIT_REASON_EXCEPTION_NMI)) return; - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_intr_info = vmx->exit_intr_info; + if (!(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + vmx->exit_intr_info = exit_intr_info; /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(exit_intr_info)) + if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || + is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ -- cgit v1.2.3-59-g8ed1b From 949c033694864082db9b3f5304723a6d7407f8e2 Mon Sep 17 00:00:00 2001 From: Gleb Fotengauer-Malinovskiy Date: Tue, 11 Jul 2017 00:22:33 +0300 Subject: KVM: s390: Fix KVM_S390_GET_CMMA_BITS ioctl definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case of KVM_S390_GET_CMMA_BITS, the kernel does not only read struct kvm_s390_cmma_log passed from userspace (which constitutes _IOC_WRITE), it also writes back a return value (which constitutes _IOC_READ) making this an _IOWR ioctl instead of _IOW. Fixes: 4036e387 ("KVM: s390: ioctls to get and set guest storage attributes") Signed-off-by: Gleb Fotengauer-Malinovskiy Acked-by: Christian Borntraeger Signed-off-by: Radim Krčmář --- include/uapi/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c0b6dfec5f87..ebd604c222d8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1351,7 +1351,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_X86_SMM */ #define KVM_SMI _IO(KVMIO, 0xb7) /* Available with KVM_CAP_S390_CMMA_MIGRATION */ -#define KVM_S390_GET_CMMA_BITS _IOW(KVMIO, 0xb8, struct kvm_s390_cmma_log) +#define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3-59-g8ed1b From fb5307298e49ec1668c3a9ec888c1b9da4347395 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 10 Jul 2017 20:53:28 +0200 Subject: KVM: x86: take slots_lock in kvm_free_pit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_vm_release() did not have slots_lock when calling kvm_io_bus_unregister_dev() and this went unnoticed until 4a12f9517728 ("KVM: mark kvm->busses as rcu protected") added dynamic checks. Luckily, there should be no race at that point: ============================= WARNING: suspicious RCU usage 4.12.0.kvm+ #0 Not tainted ----------------------------- ./include/linux/kvm_host.h:479 suspicious rcu_dereference_check() usage! lockdep_rcu_suspicious+0xc5/0x100 kvm_io_bus_unregister_dev+0x173/0x190 [kvm] kvm_free_pit+0x28/0x80 [kvm] kvm_arch_sync_events+0x2d/0x30 [kvm] kvm_put_kvm+0xa7/0x2a0 [kvm] kvm_vm_release+0x21/0x30 [kvm] Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/i8254.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a78b445ce411..af192895b1fc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -724,8 +724,10 @@ void kvm_free_pit(struct kvm *kvm) struct kvm_pit *pit = kvm->arch.vpit; if (pit) { + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); + mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); kthread_destroy_worker(pit->worker); -- cgit v1.2.3-59-g8ed1b From b742c1e6e79ddf4192d76336da2407c65ca7242f Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 22 Jun 2017 09:05:26 +0200 Subject: KVM: SVM: handle singlestep exception when skipping emulated instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_skip_emulated_instruction handles the singlestep debug exception which is something we almost always want. This commit (specifically the change in rdmsr_interception) makes the debug.flat KVM unit test pass on AMD. Two call sites still call skip_emulated_instruction directly: * In svm_queue_exception where it's used only for moving the rip forward * In task_switch_interception which is analogous to handle_task_switch in VMX Signed-off-by: Ladi Prosek Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 59 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 905ea6052517..3da42d7c629e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2267,7 +2267,7 @@ static int io_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ - int size, in, string; + int size, in, string, ret; unsigned port; ++svm->vcpu.stat.io_exits; @@ -2279,10 +2279,16 @@ static int io_interception(struct vcpu_svm *svm) port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; svm->next_rip = svm->vmcb->control.exit_info_2; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); - return in ? kvm_fast_pio_in(vcpu, size, port) - : kvm_fast_pio_out(vcpu, size, port); + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + if (in) + return kvm_fast_pio_in(vcpu, size, port) && ret; + else + return kvm_fast_pio_out(vcpu, size, port) && ret; } static int nmi_interception(struct vcpu_svm *svm) @@ -3055,6 +3061,7 @@ static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3064,18 +3071,19 @@ static int vmload_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; struct page *page; + int ret; if (nested_svm_check_permissions(svm)) return 1; @@ -3085,12 +3093,12 @@ static int vmsave_interception(struct vcpu_svm *svm) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); - return 1; + return ret; } static int vmrun_interception(struct vcpu_svm *svm) @@ -3123,25 +3131,29 @@ failed: static int stgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); - return 1; + return ret; } static int clgi_interception(struct vcpu_svm *svm) { + int ret; + if (nested_svm_check_permissions(svm)) return 1; svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + ret = kvm_skip_emulated_instruction(&svm->vcpu); disable_gif(svm); @@ -3152,7 +3164,7 @@ static int clgi_interception(struct vcpu_svm *svm) mark_dirty(svm->vmcb, VMCB_INTR); } - return 1; + return ret; } static int invlpga_interception(struct vcpu_svm *svm) @@ -3166,8 +3178,7 @@ static int invlpga_interception(struct vcpu_svm *svm) kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX)); svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int skinit_interception(struct vcpu_svm *svm) @@ -3190,7 +3201,7 @@ static int xsetbv_interception(struct vcpu_svm *svm) if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } return 1; @@ -3286,8 +3297,7 @@ static int invlpg_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); - skip_emulated_instruction(&svm->vcpu); - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int emulate_on_interception(struct vcpu_svm *svm) @@ -3437,9 +3447,7 @@ static int dr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, reg, val); } - skip_emulated_instruction(&svm->vcpu); - - return 1; + return kvm_skip_emulated_instruction(&svm->vcpu); } static int cr8_write_interception(struct vcpu_svm *svm) @@ -3562,6 +3570,7 @@ static int rdmsr_interception(struct vcpu_svm *svm) if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_read(ecx, msr_info.data); @@ -3570,9 +3579,8 @@ static int rdmsr_interception(struct vcpu_svm *svm) kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) @@ -3698,11 +3706,11 @@ static int wrmsr_interception(struct vcpu_svm *svm) if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); + return 1; } else { trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(&svm->vcpu); + return kvm_skip_emulated_instruction(&svm->vcpu); } - return 1; } static int msr_interception(struct vcpu_svm *svm) @@ -3731,8 +3739,7 @@ static int pause_interception(struct vcpu_svm *svm) static int nop_interception(struct vcpu_svm *svm) { - skip_emulated_instruction(&(svm->vcpu)); - return 1; + return kvm_skip_emulated_instruction(&(svm->vcpu)); } static int monitor_interception(struct vcpu_svm *svm) -- cgit v1.2.3-59-g8ed1b From 8a77e90966e92759f94087f9845d413290be0d70 Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Thu, 6 Jul 2017 15:50:44 -0500 Subject: KVM: SVM: Prepare for new bit definition in lbr_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lbr_ctl variable in the vmcb control area is used to enable or disable Last Branch Record (LBR) virtualization. However, this is to be done using only bit 0 of the variable. To correct this and to prepare for a new feature, change the current usage to work only on a particular bit. Signed-off-by: Janakarajan Natarajan Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/svm.h | 2 ++ arch/x86/kvm/svm.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 14824fc78f7e..d1163f64d732 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -119,6 +119,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) +#define LBR_CTL_ENABLE_MASK BIT_ULL(0) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3da42d7c629e..6e72127c0d0e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -947,7 +947,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 1; + svm->vmcb->control.lbr_ctl |= LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); @@ -958,7 +958,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl = 0; + svm->vmcb->control.lbr_ctl &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); -- cgit v1.2.3-59-g8ed1b From 0dc92119b50be539a5480d72a00ae8098bdba2fc Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Thu, 6 Jul 2017 15:50:45 -0500 Subject: KVM: SVM: Rename lbr_ctl field in the vmcb control area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the lbr_ctl variable to better reflect the purpose of the field - provide support for virtualization extensions. Signed-off-by: Janakarajan Natarajan Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/svm.h | 2 +- arch/x86/kvm/svm.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index d1163f64d732..74d139352491 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -83,7 +83,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 event_inj; u32 event_inj_err; u64 nested_cr3; - u64 lbr_ctl; + u64 virt_ext; u32 clean; u32 reserved_5; u64 next_rip; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6e72127c0d0e..9a09f89145f2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -947,7 +947,7 @@ static void svm_enable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl |= LBR_CTL_ENABLE_MASK; + svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); @@ -958,7 +958,7 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) { u32 *msrpm = svm->msrpm; - svm->vmcb->control.lbr_ctl &= ~LBR_CTL_ENABLE_MASK; + svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); @@ -2708,7 +2708,7 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; dst->nested_cr3 = from->nested_cr3; - dst->lbr_ctl = from->lbr_ctl; + dst->virt_ext = from->virt_ext; } static int nested_svm_vmexit(struct vcpu_svm *svm) @@ -3014,7 +3014,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) /* We don't want to see VMMCALLs from a nested guest */ clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; + svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; @@ -4124,7 +4124,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); -- cgit v1.2.3-59-g8ed1b From 76ff359249f1e80ff0d6ced3b52b1088c4e61b9b Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Thu, 6 Jul 2017 15:50:46 -0500 Subject: KVM: SVM: Add Virtual VMLOAD VMSAVE feature definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a new cpufeature definition for Virtual VMLOAD VMSAVE. Signed-off-by: Janakarajan Natarajan Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2701e5f8145b..ca3c48c0872f 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -286,6 +286,7 @@ #define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ +#define X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE (15*32+15) /* Virtual VMLOAD VMSAVE */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ #define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ -- cgit v1.2.3-59-g8ed1b From 89c8a4984fc98e625517bfe5083342d77ee35811 Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Thu, 6 Jul 2017 15:50:47 -0500 Subject: KVM: SVM: Enable Virtual VMLOAD VMSAVE feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the Virtual VMLOAD VMSAVE feature. This is done by setting bit 1 at position B8h in the vmcb. The processor must have nested paging enabled, be in 64-bit mode and have support for the Virtual VMLOAD VMSAVE feature for the bit to be set in the vmcb. Signed-off-by: Janakarajan Natarajan Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/svm.h | 1 + arch/x86/kvm/svm.c | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 74d139352491..58fffe79e417 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -120,6 +120,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) #define LBR_CTL_ENABLE_MASK BIT_ULL(0) +#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) #define SVM_INTERRUPT_SHADOW_MASK 1 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a09f89145f2..4c98d362e3e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -277,6 +277,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* enable/disable Virtual VMLOAD VMSAVE */ +static int vls = true; +module_param(vls, int, 0444); + /* AVIC VM ID bit masks and lock */ static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); static DEFINE_SPINLOCK(avic_vm_id_lock); @@ -1093,6 +1097,16 @@ static __init int svm_hardware_setup(void) } } + if (vls) { + if (!npt_enabled || + !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) || + !IS_ENABLED(CONFIG_X86_64)) { + vls = false; + } else { + pr_info("Virtual VMLOAD VMSAVE supported\n"); + } + } + return 0; err: @@ -1280,6 +1294,16 @@ static void init_vmcb(struct vcpu_svm *svm) if (avic) avic_init_vmcb(svm); + /* + * If hardware supports Virtual VMLOAD VMSAVE then enable it + * in VMCB and clear intercepts to avoid #VMEXIT. + */ + if (vls) { + clr_intercept(svm, INTERCEPT_VMLOAD); + clr_intercept(svm, INTERCEPT_VMSAVE); + svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + } + mark_all_dirty(svm->vmcb); enable_gif(svm); -- cgit v1.2.3-59-g8ed1b From 286de8f6ac9202f1c9012784639156c6ec386eb8 Mon Sep 17 00:00:00 2001 From: Claudio Imbrenda Date: Wed, 12 Jul 2017 17:56:44 +0200 Subject: KVM: trigger uevents when creating or destroying a VM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a few lines to the KVM common code to fire a KOBJ_CHANGE uevent whenever a KVM VM is created or destroyed. The event carries five environment variables: CREATED indicates how many times a new VM has been created. It is useful for example to trigger specific actions when the first VM is started COUNT indicates how many VMs are currently active. This can be used for logging or monitoring purposes PID has the pid of the KVM process that has been started or stopped. This can be used to perform process-specific tuning. STATS_PATH contains the path in debugfs to the directory with all the runtime statistics for this VM. This is useful for performance monitoring and profiling. EVENT described the type of event, its value can be either "create" or "destroy" Specific udev rules can be then set up in userspace to deal with the creation or destruction of VMs as needed. Signed-off-by: Claudio Imbrenda Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7766c2b52797..82987d457b8b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -130,6 +130,12 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); static bool largepages_enabled = true; +#define KVM_EVENT_CREATE_VM 0 +#define KVM_EVENT_DESTROY_VM 1 +static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); +static unsigned long long kvm_createvm_count; +static unsigned long long kvm_active_vms; + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) @@ -740,6 +746,7 @@ static void kvm_destroy_vm(struct kvm *kvm) int i; struct mm_struct *mm = kvm->mm; + kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); @@ -3220,6 +3227,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) fput(file); return -ENOMEM; } + kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(r, file); return r; @@ -3872,6 +3880,67 @@ static const struct file_operations *stat_fops[] = { [KVM_STAT_VM] = &vm_stat_fops, }; +static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) +{ + struct kobj_uevent_env *env; + char *tmp, *pathbuf = NULL; + unsigned long long created, active; + + if (!kvm_dev.this_device || !kvm) + return; + + spin_lock(&kvm_lock); + if (type == KVM_EVENT_CREATE_VM) { + kvm_createvm_count++; + kvm_active_vms++; + } else if (type == KVM_EVENT_DESTROY_VM) { + kvm_active_vms--; + } + created = kvm_createvm_count; + active = kvm_active_vms; + spin_unlock(&kvm_lock); + + env = kzalloc(sizeof(*env), GFP_KERNEL); + if (!env) + return; + + add_uevent_var(env, "CREATED=%llu", created); + add_uevent_var(env, "COUNT=%llu", active); + + if (type == KVM_EVENT_CREATE_VM) + add_uevent_var(env, "EVENT=create"); + else if (type == KVM_EVENT_DESTROY_VM) + add_uevent_var(env, "EVENT=destroy"); + + if (kvm->debugfs_dentry) { + char p[ITOA_MAX_LEN]; + + snprintf(p, sizeof(p), "%s", kvm->debugfs_dentry->d_name.name); + tmp = strchrnul(p + 1, '-'); + *tmp = '\0'; + add_uevent_var(env, "PID=%s", p); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); + if (pathbuf) { + /* sizeof counts the final '\0' */ + int len = sizeof("STATS_PATH=") - 1; + const char *pvar = "STATS_PATH="; + + tmp = dentry_path_raw(kvm->debugfs_dentry, + pathbuf + len, + PATH_MAX - len); + if (!IS_ERR(tmp)) { + memcpy(tmp - len, pvar, len); + env->envp[env->envp_idx++] = tmp - len; + } + } + } + /* no need for checks, since we are adding at most only 5 keys */ + env->envp[env->envp_idx++] = NULL; + kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); + kfree(env); + kfree(pathbuf); +} + static int kvm_init_debug(void) { int r = -EEXIST; -- cgit v1.2.3-59-g8ed1b From a826faf108e2d855929342268e68c43ba667379a Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Mon, 26 Jun 2017 09:56:43 +0200 Subject: KVM: x86: make backwards_tsc_observed a per-VM variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The backwards_tsc_observed global introduced in commit 16a9602 is never reset to false. If a VM happens to be running while the host is suspended (a common source of the TSC jumping backwards), master clock will never be enabled again for any VM. In contrast, if no VM is running while the host is suspended, master clock is unaffected. This is inconsistent and unnecessarily strict. Let's track the backwards_tsc_observed variable separately and let each VM start with a clean slate. Real world impact: My Windows VMs get slower after my laptop undergoes a suspend/resume cycle. The only way to get the perf back is unloading and reloading the kvm module. Signed-off-by: Ladi Prosek Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1588e9e3dc01..ef37d0dc61bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -803,6 +803,7 @@ struct kvm_arch { int audit_point; #endif + bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca128a9c9cc4..08aa5e442aa7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -134,8 +134,6 @@ module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1719,7 +1717,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed + && !ka->backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -7835,8 +7833,8 @@ int kvm_arch_hardware_enable(void) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.backwards_tsc_observed = true; kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; -- cgit v1.2.3-59-g8ed1b From efc479e6900c22bad9a2b649d13405ed9cde2d53 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Thu, 22 Jun 2017 16:51:01 +0300 Subject: kvm: x86: hyperv: add KVM_CAP_HYPERV_SYNIC2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a flaw in the Hyper-V SynIC implementation in KVM: when message page or event flags page is enabled by setting the corresponding msr, KVM zeroes it out. This is problematic because on migration the corresponding MSRs are loaded on the destination, so the content of those pages is lost. This went unnoticed so far because the only user of those pages was in-KVM hyperv synic timers, which could continue working despite that zeroing. Newer QEMU uses those pages for Hyper-V VMBus implementation, and zeroing them breaks the migration. Besides, in newer QEMU the content of those pages is fully managed by QEMU, so zeroing them is undesirable even when writing the MSRs from the guest side. To support this new scheme, introduce a new capability, KVM_CAP_HYPERV_SYNIC2, which, when enabled, makes sure that the synic pages aren't zeroed out in KVM. Signed-off-by: Roman Kagan Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/hyperv.c | 13 +++++++++---- arch/x86/kvm/hyperv.h | 2 +- arch/x86/kvm/x86.c | 7 ++++++- include/uapi/linux/kvm.h | 1 + 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 3a9831b72945..78ac577c9378 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4329,3 +4329,12 @@ Querying this capability returns a bitmap indicating the possible virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N (counting from the right) is set, then a virtual SMT mode of 2^N is available. + +8.11 KVM_CAP_HYPERV_SYNIC2 + +Architectures: x86 + +This capability enables a newer version of Hyper-V Synthetic interrupt +controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM +doesn't clear SynIC message and event flags pages when they are enabled by +writing to the respective MSRs. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ef37d0dc61bd..9d8de5dd7546 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -462,6 +462,7 @@ struct kvm_vcpu_hv_synic { DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; + bool dont_zero_synic_pages; }; /* Hyper-V per vcpu emulation context */ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index ebae57ac5902..a8084406707e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -221,7 +221,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic->version = data; break; case HV_X64_MSR_SIEFP: - if (data & HV_SYNIC_SIEFP_ENABLE) + if ((data & HV_SYNIC_SIEFP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -232,7 +233,8 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, synic_exit(synic, msr); break; case HV_X64_MSR_SIMP: - if (data & HV_SYNIC_SIMP_ENABLE) + if ((data & HV_SYNIC_SIMP_ENABLE) && !host && + !synic->dont_zero_synic_pages) if (kvm_clear_guest(vcpu->kvm, data & PAGE_MASK, PAGE_SIZE)) { ret = 1; @@ -687,14 +689,17 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu) +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { + struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); + /* * Hyper-V SynIC auto EOI SINT's are * not compatible with APICV, so deactivate APICV */ kvm_vcpu_deactivate_apicv(vcpu); - vcpu_to_synic(vcpu)->active = true; + synic->active = true; + synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index cd1119538add..12f65fe1011d 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -56,7 +56,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu); void kvm_hv_irq_routing_update(struct kvm *kvm); int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint); void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); -int kvm_hv_activate_synic(struct kvm_vcpu *vcpu); +int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08aa5e442aa7..4f41c5222ecd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2659,6 +2659,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: + case KVM_CAP_HYPERV_SYNIC2: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -3382,10 +3383,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return -EINVAL; switch (cap->cap) { + case KVM_CAP_HYPERV_SYNIC2: + if (cap->args[0]) + return -EINVAL; case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; - return kvm_hv_activate_synic(vcpu); + return kvm_hv_activate_synic(vcpu, cap->cap == + KVM_CAP_HYPERV_SYNIC2); default: return -EINVAL; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ebd604c222d8..38b2cfbc8112 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -927,6 +927,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_CMMA_MIGRATION 145 #define KVM_CAP_PPC_FWNMI 146 #define KVM_CAP_PPC_SMT_POSSIBLE 147 +#define KVM_CAP_HYPERV_SYNIC2 148 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b From cfcd20e5caad6ba552978c16ed8bed7edb0143cf Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 13 Jul 2017 18:30:39 -0700 Subject: KVM: x86: Simplify kvm_x86_ops->queue_exception parameter list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch removes all arguments except the first in kvm_x86_ops->queue_exception since they can extract the arguments from vcpu->arch.exception themselves. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 4 +--- arch/x86/kvm/svm.c | 8 +++++--- arch/x86/kvm/vmx.c | 8 +++++--- arch/x86/kvm/x86.c | 5 +---- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d8de5dd7546..8d11ddcb0dbf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -954,9 +954,7 @@ struct kvm_x86_ops { unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); void (*set_nmi)(struct kvm_vcpu *vcpu); - void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject); + void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4c98d362e3e4..cde756a02b1a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -637,11 +637,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void svm_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; /* * If we are within a nested VM we'd better #VMEXIT and let the guest diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 25f2fdccf625..69cc228436ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2435,11 +2435,13 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) return 1; } -static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code, - bool reinject) +static void vmx_queue_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned nr = vcpu->arch.exception.nr; + bool has_error_code = vcpu->arch.exception.has_error_code; + bool reinject = vcpu->arch.exception.reinject; + u32 error_code = vcpu->arch.exception.error_code; u32 intr_info = nr | INTR_INFO_VALID_MASK; if (!reinject && is_guest_mode(vcpu) && diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4f41c5222ecd..e149c92476f1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6356,10 +6356,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_update_dr7(vcpu); } - kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code, - vcpu->arch.exception.reinject); + kvm_x86_ops->queue_exception(vcpu); return 0; } -- cgit v1.2.3-59-g8ed1b From 1261bfa326f5e903166498628a1894edce0caabc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 13 Jul 2017 18:30:40 -0700 Subject: KVM: async_pf: Add L1 guest async_pf #PF vmexit handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the L1 guest async page fault #PF vmexit handler, such by L1 similar to ordinary async page fault. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li [Passed insn parameters to kvm_mmu_page_fault().] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 3 +++ arch/x86/kvm/svm.c | 36 ++++++------------------------------ arch/x86/kvm/vmx.c | 15 ++++++++------- 5 files changed, 51 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8d11ddcb0dbf..4f20ee6c79a1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -650,6 +650,7 @@ struct kvm_vcpu_arch { u64 msr_val; u32 id; bool send_user_only; + u32 host_apf_reason; } apf; /* OSVW MSRs (AMD only) */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aafd399cf8c6..3825a35cd752 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -46,6 +46,7 @@ #include #include #include +#include "trace.h" /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -3780,6 +3781,38 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return false; } +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect) +{ + int r = 1; + + switch (vcpu->arch.apf.host_apf_reason) { + default: + trace_kvm_page_fault(fault_address, error_code); + + if (need_unprotect && kvm_event_needs_reinjection(vcpu)) + kvm_mmu_unprotect_page_virt(vcpu, fault_address); + r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, + insn_len); + break; + case KVM_PV_REASON_PAGE_NOT_PRESENT: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wait(fault_address); + local_irq_enable(); + break; + case KVM_PV_REASON_PAGE_READY: + vcpu->arch.apf.host_apf_reason = 0; + local_irq_disable(); + kvm_async_pf_task_wake(fault_address); + local_irq_enable(); + break; + } + return r; +} +EXPORT_SYMBOL_GPL(kvm_handle_page_fault); + static bool check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a276834950c1..d7d248a000dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -77,6 +77,9 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); +int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, + u64 fault_address, char *insn, int insn_len, + bool need_unprotect); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cde756a02b1a..fb23497cf915 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -194,7 +194,6 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; - u32 apf_reason; /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; @@ -2122,34 +2121,11 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u64 error_code; - int r = 1; + u64 error_code = svm->vmcb->control.exit_info_1; - switch (svm->apf_reason) { - default: - error_code = svm->vmcb->control.exit_info_1; - - trace_kvm_page_fault(fault_address, error_code); - if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) - kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, svm->vmcb->control.insn_bytes, - svm->vmcb->control.insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wait(fault_address); - local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - svm->apf_reason = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; - } - return r; + svm->vmcb->control.insn_len, !npt_enabled); } static int db_interception(struct vcpu_svm *svm) @@ -2630,7 +2606,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) break; case SVM_EXIT_EXCP_BASE + PF_VECTOR: /* When we're shadowing, trap PFs, but not async PF */ - if (!npt_enabled && svm->apf_reason == 0) + if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0) return NESTED_EXIT_HOST; break; default: @@ -2677,7 +2653,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->apf_reason != 0) + svm->vcpu.arch.apf.host_apf_reason != 0) vmexit = NESTED_EXIT_DONE; break; } @@ -4998,7 +4974,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 69cc228436ea..c9c46e63f744 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5698,14 +5698,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - /* EPT won't cause page fault directly */ - BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); - trace_kvm_page_fault(cr2, error_code); - - if (kvm_event_needs_reinjection(vcpu)) - kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); + /* EPT won't cause page fault directly */ + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0, + true); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -8643,6 +8640,10 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmx->exit_intr_info = exit_intr_info; + /* if exit due to PF check for async PF */ + if (is_page_fault(exit_intr_info)) + vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + /* Handle machine checks before interrupts are enabled */ if (basic_exit_reason == EXIT_REASON_MCE_DURING_VMENTRY || is_machine_check(exit_intr_info)) -- cgit v1.2.3-59-g8ed1b From adfe20fb48785dd73af3bf91407196eb5403c8cf Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 13 Jul 2017 18:30:41 -0700 Subject: KVM: async_pf: Force a nested vmexit if the injected #PF is async_pf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an nested_apf field to vcpu->arch.exception to identify an async page fault, and constructs the expected vm-exit information fields. Force a nested VM exit from nested_vmx_check_exception() if the injected #PF is async page fault. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 16 ++++++++++------ arch/x86/kvm/vmx.c | 17 ++++++++++++++--- arch/x86/kvm/x86.c | 9 ++++++++- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 722d0e568863..fde36f189836 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -23,6 +23,7 @@ struct x86_exception { u16 error_code; bool nested_page_fault; u64 address; /* cr2 or nested page fault gpa */ + u8 async_page_fault; }; /* diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4f20ee6c79a1..5e9ac508f718 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -550,6 +550,7 @@ struct kvm_vcpu_arch { bool reinject; u8 nr; u32 error_code; + u8 nested_apf; } exception; struct kvm_queued_interrupt { @@ -651,6 +652,7 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; u32 host_apf_reason; + unsigned long nested_apf_token; } apf; /* OSVW MSRs (AMD only) */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fb23497cf915..4d8141e533c3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2423,15 +2423,19 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, if (!is_guest_mode(&svm->vcpu)) return 0; + vmexit = nested_svm_intercept(svm); + if (vmexit != NESTED_EXIT_DONE) + return 0; + svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; svm->vmcb->control.exit_info_1 = error_code; - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - - vmexit = nested_svm_intercept(svm); - if (vmexit == NESTED_EXIT_DONE) - svm->nested.exit_required = true; + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + svm->nested.exit_required = true; return vmexit; } @@ -2653,7 +2657,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm) } /* async page fault always cause vmexit */ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->vcpu.arch.apf.host_apf_reason != 0) + svm->vcpu.arch.exception.nested_apf != 0) vmexit = NESTED_EXIT_DONE; break; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c9c46e63f744..5a3bb1a697a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2422,13 +2422,24 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. */ -static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) +static int nested_vmx_check_exception(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + unsigned int nr = vcpu->arch.exception.nr; - if (!(vmcs12->exception_bitmap & (1u << nr))) + if (!((vmcs12->exception_bitmap & (1u << nr)) || + (nr == PF_VECTOR && vcpu->arch.exception.nested_apf))) return 0; + if (vcpu->arch.exception.nested_apf) { + vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code); + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, + PF_VECTOR | INTR_TYPE_HARD_EXCEPTION | + INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK, + vcpu->arch.apf.nested_apf_token); + return 1; + } + nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, vmcs_read32(VM_EXIT_INTR_INFO), vmcs_readl(EXIT_QUALIFICATION)); @@ -2445,7 +2456,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) u32 intr_info = nr | INTR_INFO_VALID_MASK; if (!reinject && is_guest_mode(vcpu) && - nested_vmx_check_exception(vcpu, nr)) + nested_vmx_check_exception(vcpu)) return; if (has_error_code) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e149c92476f1..f3f10154c133 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -450,7 +450,12 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = fault->address; + vcpu->arch.exception.nested_apf = + is_guest_mode(vcpu) && fault->async_page_fault; + if (vcpu->arch.exception.nested_apf) + vcpu->arch.apf.nested_apf_token = fault->address; + else + vcpu->arch.cr2 = fault->address; kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); @@ -8582,6 +8587,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } } @@ -8604,6 +8610,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, fault.error_code = 0; fault.nested_page_fault = false; fault.address = work->arch.token; + fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); } vcpu->arch.apf.halted = false; -- cgit v1.2.3-59-g8ed1b From 52a5c155cf79f1f059bffebf4d06d0249573e659 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 13 Jul 2017 18:30:42 -0700 Subject: KVM: async_pf: Let guest support delivery of async_pf from guest mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds another flag bit (bit 2) to MSR_KVM_ASYNC_PF_EN. If bit 2 is 1, async page faults are delivered to L1 as #PF vmexits; if bit 2 is 0, kvm_can_do_async_pf returns 0 if in guest mode. This is similar to what svm.c wanted to do all along, but it is only enabled for Linux as L1 hypervisor. Foreign hypervisors must never receive async page faults as vmexits, because they'd probably be very confused about that. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/msr.txt | 5 +++-- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 7 ++++++- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 5 +++-- 7 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 0a9ea515512a..1ebecc115dc6 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -166,10 +166,11 @@ MSR_KVM_SYSTEM_TIME: 0x12 MSR_KVM_ASYNC_PF_EN: 0x4b564d02 data: Bits 63-6 hold 64-byte aligned physical address of a 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-2 are reserved and should be zero. Bit 0 is 1 + zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 when asynchronous page faults are enabled on the vcpu 0 when disabled. Bit 1 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. + when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults + are delivered to L1 as #PF vmexits. First 4 byte of 64 byte memory location will be written to by the hypervisor at the time of asynchronous page fault (APF) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5e9ac508f718..da3261e384d3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -653,6 +653,7 @@ struct kvm_vcpu_arch { bool send_user_only; u32 host_apf_reason; unsigned long nested_apf_token; + bool delivery_as_pf_vmexit; } apf; /* OSVW MSRs (AMD only) */ diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index cff0bb6556f8..a965e5b0d328 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -67,6 +67,7 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) +#define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 43e10d6fdbed..71c17a5be983 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -330,7 +330,12 @@ static void kvm_guest_cpu_init(void) #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; #endif - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED); + pa |= KVM_ASYNC_PF_ENABLED; + + /* Async page fault support for L1 hypervisor is optional */ + if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, + (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3825a35cd752..9b1dd114956a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3749,7 +3749,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) kvm_event_needs_reinjection(vcpu))) return false; - if (is_guest_mode(vcpu)) + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; return kvm_x86_ops->interrupt_allowed(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5a3bb1a697a2..84e62acf2dd8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8037,7 +8037,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) - return enable_ept; + return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept; else if (is_no_device(intr_info) && !(vmcs12->guest_cr0 & X86_CR0_TS)) return false; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f3f10154c133..6753f0982791 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2063,8 +2063,8 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 2:5 are reserved, Should be zero */ - if (data & 0x3c) + /* Bits 3:5 are reserved, Should be zero */ + if (data & 0x38) return 1; vcpu->arch.apf.msr_val = data; @@ -2080,6 +2080,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); + vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; kvm_async_pf_wakeup_all(vcpu); return 0; } -- cgit v1.2.3-59-g8ed1b From d3457c877b14aaee8c52923eedf05a3b78af0476 Mon Sep 17 00:00:00 2001 From: Roman Kagan Date: Fri, 14 Jul 2017 17:13:20 +0300 Subject: kvm: x86: hyperv: make VP_INDEX managed by userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V identifies vCPUs by Virtual Processor Index, which can be queried via HV_X64_MSR_VP_INDEX msr. It is defined by the spec as a sequential number which can't exceed the maximum number of vCPUs per VM. APIC ids can be sparse and thus aren't a valid replacement for VP indices. Current KVM uses its internal vcpu index as VP_INDEX. However, to make it predictable and persistent across VM migrations, the userspace has to control the value of VP_INDEX. This patch achieves that, by storing vp_index explicitly on vcpu, and allowing HV_X64_MSR_VP_INDEX to be set from the host side. For compatibility it's initialized to KVM vcpu index. Also a few variables are renamed to make clear distinction betweed this Hyper-V vp_index and KVM vcpu_id (== APIC id). Besides, a new capability, KVM_CAP_HYPERV_VP_INDEX, is added to allow the userspace to skip attempting msr writes where unsupported, to avoid spamming error logs. Signed-off-by: Roman Kagan Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 9 +++++++ arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/hyperv.c | 54 +++++++++++++++++++++++++-------------- arch/x86/kvm/hyperv.h | 1 + arch/x86/kvm/x86.c | 3 +++ include/uapi/linux/kvm.h | 1 + 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 78ac577c9378..e63a35fafef0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4338,3 +4338,12 @@ This capability enables a newer version of Hyper-V Synthetic interrupt controller (SynIC). The only difference with KVM_CAP_HYPERV_SYNIC is that KVM doesn't clear SynIC message and event flags pages when they are enabled by writing to the respective MSRs. + +8.12 KVM_CAP_HYPERV_VP_INDEX + +Architectures: x86 + +This capability indicates that userspace can load HV_X64_MSR_VP_INDEX msr. Its +value is used to denote the target vcpu for a SynIC interrupt. For +compatibilty, KVM initializes this msr to KVM's internal vcpu index. When this +capability is absent, userspace can still query this msr's value. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index da3261e384d3..87ac4fba6d8e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -467,6 +467,7 @@ struct kvm_vcpu_hv_synic { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { + u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8084406707e..2695a34fa1c5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -106,14 +106,27 @@ static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, return 0; } -static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id) +static struct kvm_vcpu *get_vcpu_by_vpidx(struct kvm *kvm, u32 vpidx) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + if (vpidx < KVM_MAX_VCPUS) + vcpu = kvm_get_vcpu(kvm, vpidx); + if (vcpu && vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu_to_hv_vcpu(vcpu)->vp_index == vpidx) + return vcpu; + return NULL; +} + +static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vpidx) { struct kvm_vcpu *vcpu; struct kvm_vcpu_hv_synic *synic; - if (vcpu_id >= atomic_read(&kvm->online_vcpus)) - return NULL; - vcpu = kvm_get_vcpu(kvm, vcpu_id); + vcpu = get_vcpu_by_vpidx(kvm, vpidx); if (!vcpu) return NULL; synic = vcpu_to_synic(vcpu); @@ -320,11 +333,11 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) return ret; } -int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint) +int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vpidx, u32 sint) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -343,11 +356,11 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector) kvm_hv_notify_acked_sint(vcpu, i); } -static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi) +static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vpidx, u32 sint, int gsi) { struct kvm_vcpu_hv_synic *synic; - synic = synic_get(kvm, vcpu_id); + synic = synic_get(kvm, vpidx); if (!synic) return -EINVAL; @@ -689,6 +702,13 @@ void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu) stimer_init(&hv_vcpu->stimer[i], i); } +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu); + + hv_vcpu->vp_index = kvm_vcpu_get_idx(vcpu); +} + int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) { struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu); @@ -983,6 +1003,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { + case HV_X64_MSR_VP_INDEX: + if (!host) + return 1; + hv->vp_index = (u32)data; + break; case HV_X64_MSR_APIC_ASSIST_PAGE: { u64 gfn; unsigned long addr; @@ -1094,18 +1119,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; switch (msr) { - case HV_X64_MSR_VP_INDEX: { - int r; - struct kvm_vcpu *v; - - kvm_for_each_vcpu(r, v, vcpu->kvm) { - if (v == vcpu) { - data = r; - break; - } - } + case HV_X64_MSR_VP_INDEX: + data = hv->vp_index; break; - } case HV_X64_MSR_EOI: return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata); case HV_X64_MSR_ICR: diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 12f65fe1011d..e637631a9574 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -59,6 +59,7 @@ void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector); int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages); void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_hv_vcpu_postcreate(struct kvm_vcpu *vcpu); void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu); static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6753f0982791..5b8f07889f6a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2666,6 +2666,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_SPIN: case KVM_CAP_HYPERV_SYNIC: case KVM_CAP_HYPERV_SYNIC2: + case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: @@ -7688,6 +7689,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) struct msr_data msr; struct kvm *kvm = vcpu->kvm; + kvm_hv_vcpu_postcreate(vcpu); + if (vcpu_load(vcpu)) return; msr.data = 0x0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 38b2cfbc8112..6cd63c18708a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -928,6 +928,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_FWNMI 146 #define KVM_CAP_PPC_SMT_POSSIBLE 147 #define KVM_CAP_HYPERV_SYNIC2 148 +#define KVM_CAP_HYPERV_VP_INDEX 149 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b