From c611990844c28c61ca4b35ff69d3a2ae95ccd486 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 5 Dec 2019 07:40:43 -0500 Subject: KVM: s390: ENOTSUPP -> EOPNOTSUPP fixups There is no ENOTSUPP for userspace. Reported-by: Julian Wiedmann Fixes: 519783935451 ("KVM: s390: introduce ais mode modify function") Fixes: 2c1a48f2e5ed ("KVM: S390: add new group for flic") Reviewed-by: Cornelia Huck Reviewed-by: Thomas Huth Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 165dea4c7f19..c06c89d370a7 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2190,7 +2190,7 @@ static int flic_ais_mode_get_all(struct kvm *kvm, struct kvm_device_attr *attr) return -EINVAL; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; mutex_lock(&fi->ais_lock); ais.simm = fi->simm; @@ -2499,7 +2499,7 @@ static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) int ret = 0; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) return -EFAULT; @@ -2579,7 +2579,7 @@ static int flic_ais_mode_set_all(struct kvm *kvm, struct kvm_device_attr *attr) struct kvm_s390_ais_all ais; if (!test_kvm_facility(kvm, 72)) - return -ENOTSUPP; + return -EOPNOTSUPP; if (copy_from_user(&ais, (void __user *)attr->addr, sizeof(ais))) return -EFAULT; -- cgit v1.2.3-59-g8ed1b From 55680890ea78be0df5e1384989f1be835043c084 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 31 Jan 2020 05:02:00 -0500 Subject: KVM: s390: do not clobber registers during guest reset/store status The initial CPU reset clobbers the userspace fpc and the store status ioctl clobbers the guest acrs + fpr. As these calls are only done via ioctl (and not via vcpu_run), no CPU context is loaded, so we can (and must) act directly on the sync regs, not on the thread context. 
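To illustrate the point (sketch only, not part of the patch; the real change is in the diff below), the fpc case looks like this:

  /* wrong outside of vcpu_run: this acts on the host thread's FPU state */
  save_fpu_regs();
  current->thread.fpu.fpc = 0;

  /* correct in the ioctl path: act on the sync regs directly */
  vcpu->run->s.regs.fpc = 0;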
Cc: stable@kernel.org Fixes: e1788bb995be ("KVM: s390: handle floating point registers in the run ioctl not in vcpu_put/load") Fixes: 31d8b8d41a7e ("KVM: s390: handle access registers in the run ioctl not in vcpu_put/load") Signed-off-by: Christian Borntraeger Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Janosch Frank Link: https://lore.kernel.org/r/20200131100205.74720-2-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d9e6bf3d54f0..876802894b35 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2860,9 +2860,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | CR14_UNUSED_33 | CR14_EXTERNAL_DAMAGE_SUBMASK; - /* make sure the new fpc will be lazily loaded */ - save_fpu_regs(); - current->thread.fpu.fpc = 0; + vcpu->run->s.regs.fpc = 0; vcpu->arch.sie_block->gbea = 1; vcpu->arch.sie_block->pp = 0; vcpu->arch.sie_block->fpf &= ~FPF_BPBC; @@ -4351,7 +4349,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_s390_vcpu_store_status(vcpu, arg); + r = kvm_s390_store_status_unloaded(vcpu, arg); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; case KVM_S390_SET_INITIAL_PSW: { -- cgit v1.2.3-59-g8ed1b From cca00ebb8ad3b3a5eb6e60b1ac7e9211f66af477 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 31 Jan 2020 05:02:01 -0500 Subject: KVM: s390: Cleanup initial cpu reset The code seems to be quite old and uses lots of unneeded spaces for alignment, which doesn't really help with readability. 
Let's: * Get rid of the extra spaces * Remove the ULs as they are not needed on 0s * Define constants for the CR 0 and 14 initial values * Use the sizeof of the gcr array to memset it to 0 Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20200131100205.74720-3-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- arch/s390/include/asm/kvm_host.h | 5 +++++ arch/s390/kvm/kvm-s390.c | 18 +++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 02f4c21c57f6..73044545ecac 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -122,6 +122,11 @@ struct mcck_volatile_info { __u32 reserved; }; +#define CR0_INITIAL_MASK (CR0_UNUSED_56 | CR0_INTERRUPT_KEY_SUBMASK | \ + CR0_MEASUREMENT_ALERT_SUBMASK) +#define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ + CR14_EXTERNAL_DAMAGE_SUBMASK) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 876802894b35..bb072866bd69 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2847,19 +2847,15 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) { /* this equals initial cpu reset in pop, but we don't switch to ESA */ - vcpu->arch.sie_block->gpsw.mask = 0UL; - vcpu->arch.sie_block->gpsw.addr = 0UL; + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; kvm_s390_set_prefix(vcpu, 0); kvm_s390_set_cpu_timer(vcpu, 0); - vcpu->arch.sie_block->ckc = 0UL; - vcpu->arch.sie_block->todpr = 0; - memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64)); - vcpu->arch.sie_block->gcr[0] = CR0_UNUSED_56 | - CR0_INTERRUPT_KEY_SUBMASK | - CR0_MEASUREMENT_ALERT_SUBMASK; - vcpu->arch.sie_block->gcr[14] = CR14_UNUSED_32 | - CR14_UNUSED_33 | - CR14_EXTERNAL_DAMAGE_SUBMASK; + vcpu->arch.sie_block->ckc = 0; + vcpu->arch.sie_block->todpr = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; vcpu->run->s.regs.fpc = 0; vcpu->arch.sie_block->gbea = 1; vcpu->arch.sie_block->pp = 0; -- cgit v1.2.3-59-g8ed1b From 7de3f1423ff9431f3bd5023bb78d1e062314e7f0 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 31 Jan 2020 05:02:02 -0500 Subject: KVM: s390: Add new reset vcpu API The architecture states that we need to reset local IRQs for all CPU resets. Because the old reset interface did not support the normal CPU reset we never did that on a normal reset. Let's implement an interface for the missing normal and clear resets and reset all local IRQs, registers and control structures as stated in the architecture. Userspace might already reset the registers via the vcpu run struct, but as we need the interface for the interrupt clearing part anyway, we implement the resets fully and don't rely on userspace to reset the rest. 
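For illustration, a minimal userspace sketch of the intended usage (vm_fd and vcpu_fd are assumed to be file descriptors from KVM_CREATE_VM/KVM_CREATE_VCPU; error handling omitted):

  if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_VCPU_RESETS) > 0) {
          /* normal reset: clears local IRQs, pfault token and riccb */
          ioctl(vcpu_fd, KVM_S390_NORMAL_RESET, 0);
          /* clear reset: superset of the initial reset, additionally
           * zeroes gprs, acrs, vrs, gscb and the etokens */
          ioctl(vcpu_fd, KVM_S390_CLEAR_RESET, 0);
  } else {
          /* only the pre-existing initial reset is available */
          ioctl(vcpu_fd, KVM_S390_INITIAL_RESET, 0);
  }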
Signed-off-by: Janosch Frank Reviewed-by: Cornelia Huck Reviewed-by: Christian Borntraeger Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20200131100205.74720-4-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- Documentation/virt/kvm/api.txt | 43 +++++++++++++++++++++ arch/s390/kvm/kvm-s390.c | 84 ++++++++++++++++++++++++++++-------------- include/uapi/linux/kvm.h | 5 +++ 3 files changed, 105 insertions(+), 27 deletions(-) diff --git a/Documentation/virt/kvm/api.txt b/Documentation/virt/kvm/api.txt index ebb37b34dcfc..73448764f544 100644 --- a/Documentation/virt/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -4168,6 +4168,42 @@ This ioctl issues an ultravisor call to terminate the secure guest, unpins the VPA pages and releases all the device pages that are used to track the secure pages by hypervisor. +4.122 KVM_S390_NORMAL_RESET + +Capability: KVM_CAP_S390_VCPU_RESETS +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the cpu reset definition in the POP (Principles Of Operation). + +4.123 KVM_S390_INITIAL_RESET + +Capability: none +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the initial cpu reset definition in the POP. However, the cpu is not +put into ESA mode. This reset is a superset of the normal reset. + +4.124 KVM_S390_CLEAR_RESET + +Capability: KVM_CAP_S390_VCPU_RESETS +Architectures: s390 +Type: vcpu ioctl +Parameters: none +Returns: 0 + +This ioctl resets VCPU registers and control structures according to +the clear cpu reset definition in the POP. However, the cpu is not put +into ESA mode. This reset is a superset of the initial reset. + + 5. The kvm_run structure ------------------------ @@ -5396,3 +5432,10 @@ handling by KVM (as some KVM hypercall may be mistakenly treated as TLB flush hypercalls by Hyper-V) so userspace should disable KVM identification in CPUID and only exposes Hyper-V identification. In this case, guest thinks it's running on Hyper-V and only use Hyper-V hypercalls. + +8.22 KVM_CAP_S390_VCPU_RESETS + +Architectures: s390 + +This capability indicates that the KVM_S390_NORMAL_RESET and +KVM_S390_CLEAR_RESET ioctls are available. 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index bb072866bd69..e39f6ef97b09 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -529,6 +529,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_CMMA_MIGRATION: case KVM_CAP_S390_AIS: case KVM_CAP_S390_AIS_MIGRATION: + case KVM_CAP_S390_VCPU_RESETS: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@ -2844,29 +2845,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) } -static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) -{ - /* this equals initial cpu reset in pop, but we don't switch to ESA */ - vcpu->arch.sie_block->gpsw.mask = 0; - vcpu->arch.sie_block->gpsw.addr = 0; - kvm_s390_set_prefix(vcpu, 0); - kvm_s390_set_cpu_timer(vcpu, 0); - vcpu->arch.sie_block->ckc = 0; - vcpu->arch.sie_block->todpr = 0; - memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); - vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; - vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; - vcpu->run->s.regs.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; - vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(vcpu); - if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) - kvm_s390_vcpu_stop(vcpu); - kvm_s390_clear_local_irqs(vcpu); -} - void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); @@ -3281,10 +3259,53 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, return r; } -static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +static void kvm_arch_vcpu_ioctl_normal_reset(struct kvm_vcpu *vcpu) { - kvm_s390_vcpu_initial_reset(vcpu); - return 0; + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_RI; + vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; + memset(vcpu->run->s.regs.riccb, 0, sizeof(vcpu->run->s.regs.riccb)); + + kvm_clear_async_pf_completion_queue(vcpu); + if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) + kvm_s390_vcpu_stop(vcpu); + kvm_s390_clear_local_irqs(vcpu); +} + +static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) +{ + /* Initial reset is a superset of the normal reset */ + kvm_arch_vcpu_ioctl_normal_reset(vcpu); + + /* this equals initial cpu reset in pop, but we don't switch to ESA */ + vcpu->arch.sie_block->gpsw.mask = 0; + vcpu->arch.sie_block->gpsw.addr = 0; + kvm_s390_set_prefix(vcpu, 0); + kvm_s390_set_cpu_timer(vcpu, 0); + vcpu->arch.sie_block->ckc = 0; + vcpu->arch.sie_block->todpr = 0; + memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); + vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; + vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; + vcpu->run->s.regs.fpc = 0; + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; +} + +static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Clear reset is a superset of the initial reset */ + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + + memset(&regs->gprs, 0, sizeof(regs->gprs)); + memset(&regs->vrs, 0, sizeof(regs->vrs)); + memset(&regs->acrs, 0, sizeof(regs->acrs)); + memset(&regs->gscb, 0, sizeof(regs->gscb)); + + regs->etoken = 0; + regs->etoken_extension = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -4357,8 +4378,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); break; } + case 
KVM_S390_CLEAR_RESET: + r = 0; + kvm_arch_vcpu_ioctl_clear_reset(vcpu); + break; case KVM_S390_INITIAL_RESET: - r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); + r = 0; + kvm_arch_vcpu_ioctl_initial_reset(vcpu); + break; + case KVM_S390_NORMAL_RESET: + r = 0; + kvm_arch_vcpu_ioctl_normal_reset(vcpu); break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f0a16b4adbbd..4b95f9a31a2f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1009,6 +1009,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 #define KVM_CAP_ARM_NISV_TO_USER 177 #define KVM_CAP_ARM_INJECT_EXT_DABT 178 +#define KVM_CAP_S390_VCPU_RESETS 179 #ifdef KVM_CAP_IRQ_ROUTING @@ -1473,6 +1474,10 @@ struct kvm_enc_region { /* Available with KVM_CAP_ARM_SVE */ #define KVM_ARM_VCPU_FINALIZE _IOW(KVMIO, 0xc2, int) +/* Available with KVM_CAP_S390_VCPU_RESETS */ +#define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) +#define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) + /* Secure Encrypted Virtualization command */ enum sev_cmd_id { /* Guest initialization commands */ -- cgit v1.2.3-59-g8ed1b From ada0a50d7685e35c1b8ee1deb9a38203acda6683 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 31 Jan 2020 05:02:03 -0500 Subject: selftests: KVM: Add fpu and one reg set/get library functions Add library access to more registers. Signed-off-by: Janosch Frank Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20200131100205.74720-5-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/include/kvm_util.h | 6 +++++ tools/testing/selftests/kvm/lib/kvm_util.c | 36 ++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 29cccaf96baf..ae0d14c2540a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -125,6 +125,12 @@ void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs); int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs); +void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu); +void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, + struct kvm_fpu *fpu); +void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); +void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg); #ifdef __KVM_HAVE_VCPU_EVENTS void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 41cf45416060..a6dd0401eb50 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1373,6 +1373,42 @@ int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs) return ioctl(vcpu->fd, KVM_SET_SREGS, sregs); } +void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu); + TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu); + TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void 
vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg); + TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + +void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg) +{ + int ret; + + ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg); + TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)", + ret, errno, strerror(errno)); +} + /* * VCPU Ioctl * -- cgit v1.2.3-59-g8ed1b From b25d4cb43f31e31c176eb862db2ad3072b496d44 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Fri, 31 Jan 2020 05:02:04 -0500 Subject: selftests: KVM: s390x: Add reset tests Test if the registers end up having the correct values after a normal, initial and clear reset. Signed-off-by: Janosch Frank Reviewed-by: Thomas Huth Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/20200131100205.74720-6-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/s390x/resets.c | 155 +++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/resets.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 3138a916574a..fe1ea294730c 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -36,6 +36,7 @@ TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += s390x/resets TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c new file mode 100644 index 000000000000..fb8e976943a9 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test for s390x CPU resets + * + * Copyright (C) 2020, IBM + */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" + +#define VCPU_ID 3 + +struct kvm_vm *vm; +struct kvm_run *run; +struct kvm_sync_regs *regs; +static uint64_t regs_null[16]; + +static uint64_t crs[16] = { 0x40000ULL, + 0x42000ULL, + 0, 0, 0, 0, 0, + 0x43000ULL, + 0, 0, 0, 0, 0, + 0x44000ULL, + 0, 0 +}; + +static void guest_code_initial(void) +{ + /* Round toward 0 */ + uint32_t fpc = 0x11; + + /* Dirty registers */ + asm volatile ( + " lctlg 0,15,%0\n" + " sfpc %1\n" + : : "Q" (crs), "d" (fpc)); + GUEST_SYNC(0); +} + +static void test_one_reg(uint64_t id, uint64_t value) +{ + struct kvm_one_reg reg; + uint64_t eval_reg; + + reg.addr = (uintptr_t)&eval_reg; + reg.id = id; + vcpu_get_reg(vm, VCPU_ID, &reg); + TEST_ASSERT(eval_reg == value, "value == %s", value); +} + +static void assert_clear(void) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + struct kvm_fpu fpu; + + vcpu_regs_get(vm, VCPU_ID, &regs); + TEST_ASSERT(!memcmp(&regs.gprs, regs_null, sizeof(regs.gprs)), "grs == 0"); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0"); + + vcpu_fpu_get(vm, VCPU_ID, &fpu); + TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0"); +} + +static void assert_initial(void) +{ + struct kvm_sregs sregs; + struct kvm_fpu fpu; + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0"); + 
TEST_ASSERT(sregs.crs[14] == 0xC2000000UL, "cr14 == 0xC2000000"); + TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 12), + "cr1-13 == 0"); + TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0"); + + vcpu_fpu_get(vm, VCPU_ID, &fpu); + TEST_ASSERT(!fpu.fpc, "fpc == 0"); + + test_one_reg(KVM_REG_S390_GBEA, 1); + test_one_reg(KVM_REG_S390_PP, 0); + test_one_reg(KVM_REG_S390_TODPR, 0); + test_one_reg(KVM_REG_S390_CPU_TIMER, 0); + test_one_reg(KVM_REG_S390_CLOCK_COMP, 0); +} + +static void assert_normal(void) +{ + test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); +} + +static void test_normal(void) +{ + printf("Testing normal reset\n"); + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); + assert_normal(); + kvm_vm_free(vm); +} + +static void test_initial(void) +{ + printf("Testing initial reset\n"); + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); + assert_normal(); + assert_initial(); + kvm_vm_free(vm); +} + +static void test_clear(void) +{ + printf("Testing clear reset\n"); + vm = vm_create_default(VCPU_ID, 0, guest_code_initial); + run = vcpu_state(vm, VCPU_ID); + regs = &run->s.regs; + + vcpu_run(vm, VCPU_ID); + + vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); + assert_normal(); + assert_initial(); + assert_clear(); + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + + test_initial(); + if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) { + test_normal(); + test_clear(); + } + return 0; +} -- cgit v1.2.3-59-g8ed1b From b2ff728bae9b04b533fbc8de66f1719c4dc889de Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Fri, 31 Jan 2020 05:02:05 -0500 Subject: selftests: KVM: testing the local IRQs resets Local IRQs are reset by a normal cpu reset. The initial cpu reset and the clear cpu reset, as superset of the normal reset, both clear the IRQs too. Let's inject an interrupt to a vCPU before calling a reset and see if it is gone after the reset. We choose to inject only an emergency interrupt at this point and can extend the test to other types of IRQs later. 
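Condensed, the flow added by this test looks like the following sketch (illustrative only; the real code is in the diff below):

  struct kvm_s390_irq irq = {
          .type = KVM_S390_INT_EMERGENCY,
          .u.emerg.code = VCPU_ID,
  };
  struct kvm_s390_irq_state irq_state = {
          .buf = (unsigned long)&irq,
          .len = sizeof(irq),
  };

  _vcpu_ioctl(vm, VCPU_ID, KVM_S390_SET_IRQ_STATE, &irq_state); /* inject */
  vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0);            /* reset */
  /* KVM_S390_GET_IRQ_STATE must now report zero pending IRQs */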
Signed-off-by: Pierre Morel Signed-off-by: Janosch Frank [minor fixups] Reviewed-by: Cornelia Huck Reviewed-by: Thomas Huth Link: https://lore.kernel.org/r/20200131100205.74720-7-frankja@linux.ibm.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/resets.c | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index fb8e976943a9..1485bc6c8999 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -14,6 +14,9 @@ #include "kvm_util.h" #define VCPU_ID 3 +#define LOCAL_IRQS 32 + +struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS]; struct kvm_vm *vm; struct kvm_run *run; @@ -53,6 +56,23 @@ static void test_one_reg(uint64_t id, uint64_t value) TEST_ASSERT(eval_reg == value, "value == %s", value); } +static void assert_noirq(void) +{ + struct kvm_s390_irq_state irq_state; + int irqs; + + irq_state.len = sizeof(buf); + irq_state.buf = (unsigned long)buf; + irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state); + /* + * irqs contains the number of retrieved interrupts. Any interrupt + * (notably, the emergency call interrupt we have injected) should + * be cleared by the resets, so this should be 0. + */ + TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno); + TEST_ASSERT(!irqs, "IRQ pending"); +} + static void assert_clear(void) { struct kvm_sregs sregs; @@ -94,6 +114,22 @@ static void assert_initial(void) static void assert_normal(void) { test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID); + assert_noirq(); +} + +static void inject_irq(int cpu_id) +{ + struct kvm_s390_irq_state irq_state; + struct kvm_s390_irq *irq = &buf[0]; + int irqs; + + /* Inject IRQ */ + irq_state.len = sizeof(struct kvm_s390_irq); + irq_state.buf = (unsigned long)buf; + irq->type = KVM_S390_INT_EMERGENCY; + irq->u.emerg.code = cpu_id; + irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state); + TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno); } static void test_normal(void) @@ -106,6 +142,8 @@ static void test_normal(void) vcpu_run(vm, VCPU_ID); + inject_irq(VCPU_ID); + vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0); assert_normal(); kvm_vm_free(vm); @@ -120,6 +158,8 @@ static void test_initial(void) vcpu_run(vm, VCPU_ID); + inject_irq(VCPU_ID); + vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0); assert_normal(); assert_initial(); @@ -135,6 +175,8 @@ static void test_clear(void) vcpu_run(vm, VCPU_ID); + inject_irq(VCPU_ID); + vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0); assert_normal(); assert_initial(); -- cgit v1.2.3-59-g8ed1b From b26a695a1d78cc415fe26d74d0463f5d887980de Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:04 -0600 Subject: kvm: lapic: Introduce APICv update helper function Re-factor code into a helper function for setting lapic parameters when activate/deactivate APICv, and export the function for subsequent usage. 
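The "subsequent usage" arrives later in this series; for reference, the call site that the export enables looks like this (copied from the dynamic APICv activation patch further down):

  void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
  {
          if (!lapic_in_kernel(vcpu))
                  return;

          vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
          kvm_apic_update_apicv(vcpu);
          kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
  }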
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 22 +++++++++++++++++----- arch/x86/kvm/lapic.h | 1 + 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index cce1e6b204c8..eafc631d305c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2187,6 +2187,21 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) pr_warn_once("APIC base relocation is unsupported by KVM"); } +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (vcpu->arch.apicv_active) { + /* irr_pending is always true when apicv is activated. */ + apic->irr_pending = true; + apic->isr_count = 1; + } else { + apic->irr_pending = (apic_search_irr(apic) != -1); + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + } +} +EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); + void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -2229,8 +2244,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0); kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = vcpu->arch.apicv_active; - apic->isr_count = vcpu->arch.apicv_active ? 1 : 0; + kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -2487,9 +2501,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0)); update_divide_count(apic); start_apic_timer(apic); - apic->irr_pending = true; - apic->isr_count = vcpu->arch.apicv_active ? - 1 : count_vectors(apic->regs + APIC_ISR); + kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { kvm_x86_ops->apicv_post_state_restore(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ec730ce7a344..ec6fbfe325cf 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -91,6 +91,7 @@ void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); +void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); -- cgit v1.2.3-59-g8ed1b From 4e19c36f2df8f84da22c7287de86729aaf3e352b Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:05 -0600 Subject: kvm: x86: Introduce APICv inhibit reason bits There are several reasons in which a VM needs to deactivate APICv e.g. disable APICv via parameter during module loading, or when enable Hyper-V SynIC support. Additional inhibit reasons will be introduced later on when dynamic APICv is supported, Introduce KVM APICv inhibit reason bits along with a new variable, apicv_inhibit_reasons, to help keep track of APICv state for each VM, Initially, the APICV_INHIBIT_REASON_DISABLE bit is used to indicate the case where APICv is disabled during KVM module load. (e.g. insmod kvm_amd avic=0 or insmod kvm_intel enable_apicv=0). Signed-off-by: Suravee Suthikulpanit [Do not use get_enable_apicv; consider irqchip_split in svm.c. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/svm.c | 14 +++++++++++++- arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 20 +++++++++++++++++++- 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 329d01c689b7..4d57e4b74aae 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -873,6 +873,8 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +#define APICV_INHIBIT_REASON_DISABLE 0 + struct kvm_arch { unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; @@ -904,6 +906,7 @@ struct kvm_arch { struct kvm_apic_map *apic_map; bool apic_access_page_done; + unsigned long apicv_inhibit_reasons; gpa_t wall_clock; @@ -1478,6 +1481,8 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); +bool kvm_apicv_activated(struct kvm *kvm); +void kvm_apicv_init(struct kvm *kvm, bool enable); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9dbb990c319a..ed39f72faeaf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2052,6 +2052,18 @@ free_avic: return err; } +static int svm_vm_init(struct kvm *kvm) +{ + if (avic) { + int ret = avic_vm_init(kvm); + if (ret) + return ret; + } + + kvm_apicv_init(kvm, avic && irqchip_split(kvm)); + return 0; +} + static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { @@ -7274,7 +7286,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vm_alloc = svm_vm_alloc, .vm_free = svm_vm_free, - .vm_init = avic_vm_init, + .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, .prepare_guest_switch = svm_prepare_guest_switch, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c475fa2aaae0..69bd10a563c0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6813,6 +6813,7 @@ static int vmx_vm_init(struct kvm *kvm) break; } } + kvm_apicv_init(kvm, enable_apicv); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2d3be7f3ad67..98209b8c18c1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7469,6 +7469,23 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); } +bool kvm_apicv_activated(struct kvm *kvm) +{ + return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); +} +EXPORT_SYMBOL_GPL(kvm_apicv_activated); + +void kvm_apicv_init(struct kvm *kvm, bool enable) +{ + if (enable) + clear_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); + else + set_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); +} +EXPORT_SYMBOL_GPL(kvm_apicv_init); + static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) { struct kvm_vcpu *target = NULL; @@ -9219,10 +9236,11 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) return r; if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; + if (kvm_apicv_activated(vcpu->kvm)) + vcpu->arch.apicv_active = true; } else static_key_slow_inc(&kvm_no_apic_vcpu); -- cgit v1.2.3-59-g8ed1b From 7e3e67a98701cbcb4378b4f69b28a43351ca27c2 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jan 2020 16:54:37 +0100 Subject: KVM: x86: remove get_enable_apicv 
from kvm_x86_ops It is unused now. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm.c | 6 ------ arch/x86/kvm/vmx/vmx.c | 6 ------ 3 files changed, 13 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4d57e4b74aae..9945c7bebdf8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1121,7 +1121,6 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(struct kvm *kvm); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ed39f72faeaf..b0c343fef14d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5147,11 +5147,6 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } -static bool svm_get_enable_apicv(struct kvm *kvm) -{ - return avic && irqchip_split(kvm); -} - static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { } @@ -7343,7 +7338,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_apic_mode = svm_set_virtual_apic_mode, - .get_enable_apicv = svm_get_enable_apicv, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 69bd10a563c0..3e18df4cfb34 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3719,11 +3719,6 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) } } -static bool vmx_get_enable_apicv(struct kvm *kvm) -{ - return enable_apicv; -} - static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7787,7 +7782,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_apic_mode = vmx_set_virtual_apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .get_enable_apicv = vmx_get_enable_apicv, .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, -- cgit v1.2.3-59-g8ed1b From 8df14af42f00a434c492c9964a8095bf59831a45 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:06 -0600 Subject: kvm: x86: Add support for dynamic APICv activation Certain runtime conditions require APICv to be temporary deactivated during runtime. The current implementation only support run-time deactivation of APICv when Hyper-V SynIC is enabled, which is not temporary. In addition, for AMD, when APICv is (de)activated at runtime, all vcpus in the VM have to operate in the same mode. Thus the requesting vcpu must notify the others. 
So, introduce the following: * A new KVM_REQ_APICV_UPDATE request bit * Interfaces to request all vcpus to update APICv status * A new interface to update APICV-related parameters for each vcpu Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 +++++ arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9945c7bebdf8..0189687877a7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -78,6 +78,8 @@ #define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22) #define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23) #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) +#define KVM_REQ_APICV_UPDATE \ + KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1482,6 +1484,9 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); bool kvm_apicv_activated(struct kvm *kvm); void kvm_apicv_init(struct kvm *kvm, bool enable); +void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98209b8c18c1..616491c134ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -26,6 +26,7 @@ #include "cpuid.h" #include "pmu.h" #include "hyperv.h" +#include "lapic.h" #include #include @@ -8013,6 +8014,40 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } +void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return; + + vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + kvm_apic_update_apicv(vcpu); + kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); + +/* + * NOTE: Do not hold any lock prior to calling this. + * + * In particular, kvm_request_apicv_update() expects kvm->srcu not to be + * locked, because it calls __x86_set_memory_region() which does + * synchronize_srcu(&kvm->srcu). + */ +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + if (activate) { + if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || + !kvm_apicv_activated(kvm)) + return; + } else { + if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) || + kvm_apicv_activated(kvm)) + return; + } + + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); +} +EXPORT_SYMBOL_GPL(kvm_request_apicv_update); + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { if (!kvm_apic_present(vcpu)) @@ -8203,6 +8238,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { -- cgit v1.2.3-59-g8ed1b From 24bbf74c0c36bfbaa276c9921b55b844018b241e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:07 -0600 Subject: kvm: x86: Add APICv (de)activate request trace points Add trace points when sending request to (de)activate APICv. 
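For example (illustrative only), a deactivation request such as

  /* hypothetical caller; reason bit 0 is APICV_INHIBIT_REASON_DISABLE */
  kvm_request_apicv_update(kvm, false, APICV_INHIBIT_REASON_DISABLE);

shows up in the trace as "kvm_apicv_update_request: deactivate bit=0", per the TP_printk() format below.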
Suggested-by: Alexander Graf Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 19 +++++++++++++++++++ arch/x86/kvm/x86.c | 2 ++ 2 files changed, 21 insertions(+) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c741a0c5f80..f194dd058470 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1291,6 +1291,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +TRACE_EVENT(kvm_apicv_update_request, + TP_PROTO(bool activate, unsigned long bit), + TP_ARGS(activate, bit), + + TP_STRUCT__entry( + __field(bool, activate) + __field(unsigned long, bit) + ), + + TP_fast_assign( + __entry->activate = activate; + __entry->bit = bit; + ), + + TP_printk("%s bit=%lu", + __entry->activate ? "activate" : "deactivate", + __entry->bit) +); + /* * Tracepoint for AMD AVIC */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 616491c134ae..3ceb0bc7d3f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8044,6 +8044,7 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) return; } + trace_kvm_apicv_update_request(activate, bit); kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -10503,3 +10504,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); -- cgit v1.2.3-59-g8ed1b From 8937d762396d22bdb59f02732a66db6b58e746b1 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:08 -0600 Subject: kvm: x86: svm: Add support to (de)activate posted interrupts Introduce interface for (de)activate posted interrupts, and implement SVM hooks to toggle AMD IOMMU guest virtual APIC mode. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b0c343fef14d..e6118d17de6e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5155,17 +5155,52 @@ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { } +static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + struct vcpu_svm *svm = to_svm(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; + + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entry targeting this vcpu. + */ + spin_lock_irqsave(&svm->ir_list_lock, flags); + + if (list_empty(&svm->ir_list)) + goto out; + + list_for_each_entry(ir, &svm->ir_list, node) { + if (activate) + ret = amd_iommu_activate_guest_mode(ir->data); + else + ret = amd_iommu_deactivate_guest_mode(ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; +} + /* Note: Currently only used by Hyper-V. 
*/ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; + bool activated = kvm_vcpu_apicv_active(vcpu); - if (kvm_vcpu_apicv_active(vcpu)) + if (activated) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; mark_dirty(vmcb, VMCB_AVIC); + + svm_set_pi_irte_mode(vcpu, activated); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -- cgit v1.2.3-59-g8ed1b From dcbcfa287e964931f7051ff00ed33dbf33d39abd Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:09 -0600 Subject: KVM: svm: avic: Add support for dynamic setup/teardown of virtual APIC backing page Re-factor avic_init_access_page() to avic_update_access_page() since activate/deactivate AVIC requires setting/unsetting the memory region used for virtual APIC backing page (APIC_ACCESS_PAGE_PRIVATE_MEMSLOT). Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e6118d17de6e..dcea6b663d5c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1729,23 +1729,22 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_init_access_page(struct kvm_vcpu *vcpu) +static int avic_update_access_page(struct kvm *kvm, bool activate) { - struct kvm *kvm = vcpu->kvm; int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done) + if (kvm->arch.apic_access_page_done == activate) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); + activate ? PAGE_SIZE : 0); if (ret) goto out; - kvm->arch.apic_access_page_done = true; + kvm->arch.apic_access_page_done = activate; out: mutex_unlock(&kvm->slots_lock); return ret; @@ -1758,7 +1757,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - ret = avic_init_access_page(vcpu); + ret = avic_update_access_page(vcpu->kvm, true); if (ret) return ret; -- cgit v1.2.3-59-g8ed1b From ef8efd7a15bb7147a4ffb09758a6bd25d744a14e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:10 -0600 Subject: kvm: x86: Introduce APICv x86 ops for checking APIC inhibit reasons Inibit reason bits are used to determine if APICv deactivation is applicable for a particular hardware virtualization architecture. 
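Each vendor implements the same bitmask pattern; sketched below with a made-up example_ prefix (the real SVM and VMX versions are in the diff):

  static bool example_check_apicv_inhibit_reasons(ulong bit)
  {
          /* the inhibit reasons this implementation can react to */
          ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE);

          return supported & BIT(bit);
  }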
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx/vmx.c | 8 ++++++++ arch/x86/kvm/x86.c | 4 ++++ 4 files changed, 21 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0189687877a7..81c41bfb0a5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1123,6 +1123,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); + bool (*check_apicv_inhibit_reasons)(ulong bit); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index dcea6b663d5c..842c0630af35 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7298,6 +7298,13 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); } +static bool svm_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE); + + return supported & BIT(bit); +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7373,6 +7380,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, + .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3e18df4cfb34..7ba8de3325be 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7710,6 +7710,13 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } +static bool vmx_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE); + + return supported & BIT(bit); +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -7785,6 +7792,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, .load_eoi_exitmap = vmx_load_eoi_exitmap, .apicv_post_state_restore = vmx_apicv_post_state_restore, + .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3ceb0bc7d3f2..dbff8011f0f2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8034,6 +8034,10 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); */ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { + if (!kvm_x86_ops->check_apicv_inhibit_reasons || + !kvm_x86_ops->check_apicv_inhibit_reasons(bit)) + return; + if (activate) { if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) || !kvm_apicv_activated(kvm)) -- cgit v1.2.3-59-g8ed1b From 2de9d0ccd0fea32fc6a684f3f22496967ed608bc Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:11 -0600 Subject: kvm: x86: 
Introduce x86 ops hook for pre-update APICv AMD SVM AVIC needs to update APIC backing page mapping before changing APICv mode. Introduce struct kvm_x86_ops.pre_update_apicv_exec_ctrl function hook to be called prior KVM APICv update request to each vcpu. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/x86.c | 2 ++ 3 files changed, 9 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81c41bfb0a5f..19a7d0d3a5fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1124,6 +1124,7 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(ulong bit); + void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 842c0630af35..d85e29bc6ff1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7305,6 +7305,11 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } +static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) +{ + avic_update_access_page(kvm, activate); +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -7381,6 +7386,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, + .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dbff8011f0f2..d2f15cbe2634 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8049,6 +8049,8 @@ void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) } trace_kvm_apicv_update_request(activate, bit); + if (kvm_x86_ops->pre_update_apicv_exec_ctrl) + kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate); kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); -- cgit v1.2.3-59-g8ed1b From 6c3e4422dd201c107948adc1b7615610d7381bcb Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:12 -0600 Subject: svm: Add support for dynamic APICv Add necessary logics to support (de)activate AVIC at runtime. 
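Illustrative only: with this and the preceding patches in place, a runtime AVIC toggle is driven entirely through the generic request interface, roughly:

  /* hypothetical caller; any supported inhibit reason bit works the same way */
  kvm_request_apicv_update(kvm, false, APICV_INHIBIT_REASON_DISABLE);
  /*
   * -> kvm_x86_ops->pre_update_apicv_exec_ctrl() (avic_update_access_page() on SVM)
   * -> KVM_REQ_APICV_UPDATE on all vcpus
   * -> kvm_vcpu_update_apicv() -> svm_refresh_apicv_exec_ctrl(), which toggles
   *    AVIC_ENABLE_MASK and switches the IOMMU IRTEs between guest and legacy mode
   */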
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d85e29bc6ff1..3c211933a0a7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -387,6 +387,7 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); +static inline void avic_post_state_restore(struct kvm_vcpu *vcpu); static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_intercept(struct vcpu_svm *svm); @@ -1545,7 +1546,10 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; - vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + else + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; } static void init_vmcb(struct vcpu_svm *svm) @@ -1752,21 +1756,24 @@ out: static int avic_init_backing_page(struct kvm_vcpu *vcpu) { - int ret; u64 *entry, new_entry; int id = vcpu->vcpu_id; struct vcpu_svm *svm = to_svm(vcpu); - ret = avic_update_access_page(vcpu->kvm, true); - if (ret) - return ret; - if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) return -EINVAL; if (!svm->vcpu.arch.apic->regs) return -EINVAL; + if (kvm_apicv_activated(vcpu->kvm)) { + int ret; + + ret = avic_update_access_page(vcpu->kvm, true); + if (ret) + return ret; + } + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); /* Setting AVIC backing page address in the phy APIC ID table */ @@ -2234,7 +2241,8 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. */ - svm->avic_is_running = true; + if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm)) + svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); @@ -2359,6 +2367,8 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu) static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); avic_set_running(vcpu, true); } @@ -5186,17 +5196,25 @@ out: return ret; } -/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; bool activated = kvm_vcpu_apicv_active(vcpu); - if (activated) + if (activated) { + /** + * During AVIC temporary deactivation, guest could update + * APIC ID, DFR and LDR registers, which would not be trapped + * by avic_unaccelerated_access_interception(). In this case, + * we need to check and update the AVIC logical APIC ID table + * accordingly before re-activating. 
+ */ + avic_post_state_restore(vcpu); vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - else + } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + } mark_dirty(vmcb, VMCB_AVIC); svm_set_pi_irte_mode(vcpu, activated); -- cgit v1.2.3-59-g8ed1b From f4fdc0a2edf48f16f7b10cceaf4781fc56ab7fd9 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:13 -0600 Subject: kvm: x86: hyperv: Use APICv update request interface Since disabling APICv has to be done for all vcpus on AMD-based system, adopt the newly introduced kvm_request_apicv_update() interface, and introduce a new APICV_INHIBIT_REASON_HYPERV. Also, remove the kvm_vcpu_deactivate_apicv() since no longer used. Cc: Roman Kagan Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/hyperv.c | 5 +++-- arch/x86/kvm/svm.c | 3 ++- arch/x86/kvm/vmx/vmx.c | 3 ++- arch/x86/kvm/x86.c | 13 ------------- 5 files changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19a7d0d3a5fa..90bfe8becc56 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -876,6 +876,7 @@ enum kvm_irqchip_mode { }; #define APICV_INHIBIT_REASON_DISABLE 0 +#define APICV_INHIBIT_REASON_HYPERV 1 struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1483,7 +1484,6 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); -void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); bool kvm_apicv_activated(struct kvm *kvm); void kvm_apicv_init(struct kvm *kvm, bool enable); void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4df1c965bf1a..a86fda7a1d03 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -776,9 +776,10 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) /* * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so deactivate APICV + * not compatible with APICV, so request + * to deactivate APICV permanently. 
*/ - kvm_vcpu_deactivate_apicv(vcpu); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; return 0; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3c211933a0a7..3b87ccd320d1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7318,7 +7318,8 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) static bool svm_check_apicv_inhibit_reasons(ulong bit) { - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE); + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV); return supported & BIT(bit); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7ba8de3325be..678edbd6e278 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7712,7 +7712,8 @@ static __exit void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE); + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV); return supported & BIT(bit); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d2f15cbe2634..52edf0bb46e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7457,19 +7457,6 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } -void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) -{ - if (!lapic_in_kernel(vcpu)) { - WARN_ON_ONCE(vcpu->arch.apicv_active); - return; - } - if (!vcpu->arch.apicv_active) - return; - - vcpu->arch.apicv_active = false; - kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); -} - bool kvm_apicv_activated(struct kvm *kvm) { return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); -- cgit v1.2.3-59-g8ed1b From 9a0bf05430699dc94b7ced940f6270c7cf1d77ef Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:14 -0600 Subject: svm: Deactivate AVIC when launching guest with nested SVM support Since AVIC does not currently work w/ nested virtualization, deactivate AVIC for the guest if setting CPUID Fn80000001_ECX[SVM] (i.e. indicate support for SVM, which is needed for nested virtualization). Also, introduce a new APICV_INHIBIT_REASON_NESTED bit to be used for this reason. Suggested-by: Alexander Graf Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 90bfe8becc56..ce19dea5f2dd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -877,6 +877,7 @@ enum kvm_irqchip_mode { #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 +#define APICV_INHIBIT_REASON_NESTED 2 struct kvm_arch { unsigned long n_used_mmu_pages; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3b87ccd320d1..af90f83d7123 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5988,6 +5988,14 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) return; guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); + + /* + * Currently, AVIC does not work with nested virtualization. + * So, we disable AVIC when cpuid for SVM is set in the L1 guest. 
+ */ + if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_NESTED); } #define F feature_bit @@ -7319,7 +7327,8 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) static bool svm_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_NESTED); return supported & BIT(bit); } -- cgit v1.2.3-59-g8ed1b From f3515dc3bef81e96bdb2ac93ef8fd20b1c2aaae5 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:15 -0600 Subject: svm: Temporarily deactivate AVIC during ExtINT handling AMD AVIC does not support ExtINT. Therefore, AVIC must be temporary deactivated and fall back to using legacy interrupt injection via vINTR and interrupt window. Also, introduce APICV_INHIBIT_REASON_IRQWIN to be used for this reason. Signed-off-by: Suravee Suthikulpanit [Rename svm_request_update_avic to svm_toggle_avic_for_extint. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ce19dea5f2dd..2bd7fd96d994 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -878,6 +878,7 @@ enum kvm_irqchip_mode { #define APICV_INHIBIT_REASON_DISABLE 0 #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 +#define APICV_INHIBIT_REASON_IRQWIN 3 struct kvm_arch { unsigned long n_used_mmu_pages; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af90f83d7123..6d300c16d756 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -387,6 +387,7 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); +static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); static inline void avic_post_state_restore(struct kvm_vcpu *vcpu); static int nested_svm_exit_handled(struct vcpu_svm *svm); @@ -4461,6 +4462,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); + + /* + * For AVIC, the only reason to end up here is ExtINTs. + * In this case AVIC was temporarily disabled for + * requesting the IRQ window and we have to re-enable it. 
+ */ + svm_toggle_avic_for_irq_window(&svm->vcpu, true); + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; @@ -5164,6 +5173,17 @@ static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { } +static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) +{ + if (!avic || !lapic_in_kernel(vcpu)) + return; + + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + kvm_request_apicv_update(vcpu->kvm, activate, + APICV_INHIBIT_REASON_IRQWIN); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); +} + static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) { int ret = 0; @@ -5504,9 +5524,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (kvm_vcpu_apicv_active(vcpu)) - return; - /* * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes * 1, because that's a separate STGI/VMRUN intercept. The next time we @@ -5516,6 +5533,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * window under the assumption that the hardware will set the GIF. */ if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { + /* + * IRQ window is not needed when AVIC is enabled, + * unless we have pending ExtINT since it cannot be injected + * via AVIC. In such case, we need to temporarily disable AVIC, + * and fallback to injecting IRQ via V_IRQ. + */ + svm_toggle_avic_for_irq_window(vcpu, false); svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -7328,7 +7352,8 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_HYPERV) | - BIT(APICV_INHIBIT_REASON_NESTED); + BIT(APICV_INHIBIT_REASON_NESTED) | + BIT(APICV_INHIBIT_REASON_IRQWIN); return supported & BIT(bit); } -- cgit v1.2.3-59-g8ed1b From e2ed4078a6ef3ddf4063329298852e24c36d46c8 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:16 -0600 Subject: kvm: i8254: Deactivate APICv when using in-kernel PIT re-injection mode. AMD SVM AVIC accelerates EOI write and does not trap. This causes in-kernel PIT re-injection mode to fail since it relies on irq-ack notifier mechanism. So, APICv is activated only when in-kernel PIT is in discard mode e.g. w/ qemu option: -global kvm-pit.lost_tick_policy=discard Also, introduce APICV_INHIBIT_REASON_PIT_REINJ bit to be used for this reason. Suggested-by: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/i8254.c | 12 ++++++++++++ arch/x86/kvm/svm.c | 11 +++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2bd7fd96d994..4dffbc10d3f8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -879,6 +879,7 @@ enum kvm_irqchip_mode { #define APICV_INHIBIT_REASON_HYPERV 1 #define APICV_INHIBIT_REASON_NESTED 2 #define APICV_INHIBIT_REASON_IRQWIN 3 +#define APICV_INHIBIT_REASON_PIT_REINJ 4 struct kvm_arch { unsigned long n_used_mmu_pages; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4a6dc54cc12b..b24c606ac04b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -295,12 +295,24 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject) if (atomic_read(&ps->reinject) == reinject) return; + /* + * AMD SVM AVIC accelerates EOI write and does not trap. 
+ * This cause in-kernel PIT re-inject mode to fail + * since it checks ps->irq_ack before kvm_set_irq() + * and relies on the ack notifier to timely queue + * the pt->worker work iterm and reinject the missed tick. + * So, deactivate APICv when PIT is in reinject mode. + */ if (reinject) { + kvm_request_apicv_update(kvm, false, + APICV_INHIBIT_REASON_PIT_REINJ); /* The initial state is preserved while ps->reinject == 0. */ kvm_pit_reset_reinject(pit); kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } else { + kvm_request_apicv_update(kvm, true, + APICV_INHIBIT_REASON_PIT_REINJ); kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier); kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6d300c16d756..0b05967aa455 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1739,7 +1739,13 @@ static int avic_update_access_page(struct kvm *kvm, bool activate) int ret = 0; mutex_lock(&kvm->slots_lock); - if (kvm->arch.apic_access_page_done == activate) + /* + * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger + * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT + * memory region. So, we need to ensure that kvm->mm == current->mm. + */ + if ((kvm->arch.apic_access_page_done == activate) || + (kvm->mm != current->mm)) goto out; ret = __x86_set_memory_region(kvm, @@ -7353,7 +7359,8 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit) ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | - BIT(APICV_INHIBIT_REASON_IRQWIN); + BIT(APICV_INHIBIT_REASON_IRQWIN) | + BIT(APICV_INHIBIT_REASON_PIT_REINJ); return supported & BIT(bit); } -- cgit v1.2.3-59-g8ed1b From 1ec2405c7cbf3afa7598c6b7546c81aa0cac78dc Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:18 -0600 Subject: kvm: ioapic: Refactor kvm_ioapic_update_eoi() Refactor code for handling IOAPIC EOI for subsequent patch. There is no functional change. 
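As a rough sketch of where the refactor below lands (a standalone model with made-up types, not kernel code): the per-pin EOI work moves into its own helper and the top-level handler is reduced to matching vectors in the redirection table and delegating.

#include <stdio.h>

#define IOAPIC_NUM_PINS 24

struct redir_entry { int vector; };
static struct redir_entry redirtbl[IOAPIC_NUM_PINS];

/* Per-pin handler: the code that previously lived inside the loop body. */
static void update_eoi_one(int pin)
{
	printf("EOI retired for pin %d\n", pin);
}

/* Top-level handler: only walks the redirection table and delegates. */
static void update_eoi(int vector)
{
	for (int i = 0; i < IOAPIC_NUM_PINS; i++) {
		if (redirtbl[i].vector != vector)
			continue;
		update_eoi_one(i);
	}
}

int main(void)
{
	redirtbl[3].vector = 0x51;
	update_eoi(0x51);
	return 0;
}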
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 110 +++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 26aa22cb9b29..453c79550917 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -154,10 +154,16 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic) __rtc_irq_eoi_tracking_restore_one(vcpu); } -static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu) +static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu, + int vector) { - if (test_and_clear_bit(vcpu->vcpu_id, - ioapic->rtc_status.dest_map.map)) { + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; + + /* RTC special handling */ + if (test_bit(vcpu->vcpu_id, dest_map->map) && + (vector == dest_map->vectors[vcpu->vcpu_id]) && + (test_and_clear_bit(vcpu->vcpu_id, + ioapic->rtc_status.dest_map.map))) { --ioapic->rtc_status.pending_eoi; rtc_status_pending_eoi_check_valid(ioapic); } @@ -454,72 +460,68 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work) } #define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000 - -static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, - struct kvm_ioapic *ioapic, int vector, int trigger_mode) +static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, + int trigger_mode, + int pin) { - struct dest_map *dest_map = &ioapic->rtc_status.dest_map; struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - /* RTC special handling */ - if (test_bit(vcpu->vcpu_id, dest_map->map) && - vector == dest_map->vectors[vcpu->vcpu_id]) - rtc_irq_eoi(ioapic, vcpu); + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin]; - for (i = 0; i < IOAPIC_NUM_PINS; i++) { - union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; - - if (ent->fields.vector != vector) - continue; - - /* - * We are dropping lock while calling ack notifiers because ack - * notifier callbacks for assigned devices call into IOAPIC - * recursively. Since remote_irr is cleared only after call - * to notifiers if the same vector will be delivered while lock - * is dropped it will be put into irr and will be delivered - * after ack notifier returns. - */ - spin_unlock(&ioapic->lock); - kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i); - spin_lock(&ioapic->lock); + /* + * We are dropping lock while calling ack notifiers because ack + * notifier callbacks for assigned devices call into IOAPIC + * recursively. Since remote_irr is cleared only after call + * to notifiers if the same vector will be delivered while lock + * is dropped it will be put into irr and will be delivered + * after ack notifier returns. 
+ */ + spin_unlock(&ioapic->lock); + kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin); + spin_lock(&ioapic->lock); - if (trigger_mode != IOAPIC_LEVEL_TRIG || - kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) - continue; + if (trigger_mode != IOAPIC_LEVEL_TRIG || + kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) + return; - ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); - ent->fields.remote_irr = 0; - if (!ent->fields.mask && (ioapic->irr & (1 << i))) { - ++ioapic->irq_eoi[i]; - if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { - /* - * Real hardware does not deliver the interrupt - * immediately during eoi broadcast, and this - * lets a buggy guest make slow progress - * even if it does not correctly handle a - * level-triggered interrupt. Emulate this - * behavior if we detect an interrupt storm. - */ - schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); - ioapic->irq_eoi[i] = 0; - trace_kvm_ioapic_delayed_eoi_inj(ent->bits); - } else { - ioapic_service(ioapic, i, false); - } + ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG); + ent->fields.remote_irr = 0; + if (!ent->fields.mask && (ioapic->irr & (1 << pin))) { + ++ioapic->irq_eoi[pin]; + if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) { + /* + * Real hardware does not deliver the interrupt + * immediately during eoi broadcast, and this + * lets a buggy guest make slow progress + * even if it does not correctly handle a + * level-triggered interrupt. Emulate this + * behavior if we detect an interrupt storm. + */ + schedule_delayed_work(&ioapic->eoi_inject, HZ / 100); + ioapic->irq_eoi[pin] = 0; + trace_kvm_ioapic_delayed_eoi_inj(ent->bits); } else { - ioapic->irq_eoi[i] = 0; + ioapic_service(ioapic, pin, false); } + } else { + ioapic->irq_eoi[pin] = 0; } } void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode) { + int i; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; spin_lock(&ioapic->lock); - __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode); + rtc_irq_eoi(ioapic, vcpu, vector); + for (i = 0; i < IOAPIC_NUM_PINS; i++) { + union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i]; + + if (ent->fields.vector != vector) + continue; + kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i); + } spin_unlock(&ioapic->lock); } -- cgit v1.2.3-59-g8ed1b From f458d039db7e8518041db4169d657407e3217008 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Thu, 14 Nov 2019 14:15:19 -0600 Subject: kvm: ioapic: Lazy update IOAPIC EOI In-kernel IOAPIC does not receive EOI with AMD SVM AVIC since the processor accelerate write to APIC EOI register and does not trap if the interrupt is edge-triggered. Workaround this by lazy check for pending APIC EOI at the time when setting new IOPIC irq, and update IOAPIC EOI if no pending APIC EOI. 
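In sketch form (a standalone model; the struct and helper names below are stand-ins for kvm_apic_match_dest() and kvm_apic_pending_eoi(), not the kernel's): when a new IOAPIC irq is raised while APICv is active, look for a target vCPU that matches the entry's destination and no longer has the vector pending in its LAPIC, and only then retire the EOI on the IOAPIC side.

#include <stdbool.h>
#include <stdio.h>

#define NR_VCPUS 4

struct vcpu_model {
	bool matches_dest;	/* stand-in for kvm_apic_match_dest() */
	bool eoi_pending;	/* stand-in for kvm_apic_pending_eoi() */
};

static struct vcpu_model vcpus[NR_VCPUS];

static void handle_eoi(int vcpu_id, int pin)
{
	printf("vcpu %d: simulated EOI for pin %d\n", vcpu_id, pin);
}

/* Lazy EOI: only retire the interrupt if no matching vCPU still owes an EOI. */
static void lazy_update_eoi(int pin)
{
	for (int i = 0; i < NR_VCPUS; i++) {
		if (!vcpus[i].matches_dest || vcpus[i].eoi_pending)
			continue;
		handle_eoi(i, pin);
		break;
	}
}

int main(void)
{
	vcpus[1].matches_dest = true;	/* vcpu 1 is the target and has already EOI'd */
	lazy_update_eoi(5);
	return 0;
}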
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 453c79550917..7668fed1ce65 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -49,6 +49,11 @@ static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); +static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu, + struct kvm_ioapic *ioapic, + int trigger_mode, + int pin); + static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, unsigned long addr, unsigned long length) @@ -177,6 +182,31 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic) return false; } +static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq) +{ + int i; + struct kvm_vcpu *vcpu; + union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq]; + + kvm_for_each_vcpu(i, vcpu, ioapic->kvm) { + if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + entry->fields.dest_id, + entry->fields.dest_mode) || + kvm_apic_pending_eoi(vcpu, entry->fields.vector)) + continue; + + /* + * If no longer has pending EOI in LAPICs, update + * EOI for this vetor. + */ + rtc_irq_eoi(ioapic, vcpu, entry->fields.vector); + kvm_ioapic_update_eoi_one(vcpu, ioapic, + entry->fields.trig_mode, + irq); + break; + } +} + static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, int irq_level, bool line_status) { @@ -194,6 +224,15 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq, goto out; } + /* + * AMD SVM AVIC accelerate EOI write and do not trap, + * in-kernel IOAPIC will not be able to receive the EOI. + * In this case, we do lazy update of the pending EOI when + * trying to set IOAPIC irq. + */ + if (kvm_apicv_activated(ioapic->kvm)) + ioapic_lazy_update_eoi(ioapic, irq); + /* * Return 0 for coalesced interrupts; for edge-triggered interrupts, * this only happens if a previous edge has not been delivered due -- cgit v1.2.3-59-g8ed1b From e8ef2a19a051b755b0b9973ef1b3f81e895e2bce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jan 2020 17:02:36 +0100 Subject: KVM: SVM: allow AVIC without split irqchip SVM is now able to disable AVIC dynamically whenever the in-kernel PIT sets up an ack notifier, so we can enable it even if in-kernel IOAPIC/PIC/PIT are in use. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b05967aa455..bf0556588ad0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2073,7 +2073,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } - kvm_apicv_init(kvm, avic && irqchip_split(kvm)); + kvm_apicv_init(kvm, avic); return 0; } -- cgit v1.2.3-59-g8ed1b From 33aabd029ffbafe314dad4763dadbc23d71296eb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 23 Jan 2020 10:08:20 +0800 Subject: KVM: nVMX: delete meaningless nested_vmx_run() declaration The function nested_vmx_run() declaration is below its implementation. So this is meaningless and should be removed. 
Signed-off-by: Miaohe Lin Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2db21d59eaf5..53d522faaa69 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4723,8 +4723,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); - /* Emulate the VMLAUNCH instruction */ static int handle_vmlaunch(struct kvm_vcpu *vcpu) { -- cgit v1.2.3-59-g8ed1b From 917f9475c0a8ab8958db7f22a5d495b9a1d51be6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jan 2020 14:32:20 +0100 Subject: KVM: x86: reorganize pvclock_gtod_data members We will need a copy of tk->offs_boot in the next patch. Store it and cleanup the struct: instead of storing tk->tkr_xxx.base with the tk->offs_boot included, store the raw value in struct pvclock_clock and sum it in do_monotonic_raw and do_realtime. tk->tkr_xxx.xtime_nsec also moves to struct pvclock_clock. While at it, fix a (usually harmless) typo in do_monotonic_raw, which was using gtod->clock.shift instead of gtod->raw_clock.shift. Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 52edf0bb46e5..8faa721e4c38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1610,6 +1610,8 @@ struct pvclock_clock { u64 mask; u32 mult; u32 shift; + u64 base_cycles; + u64 offset; }; struct pvclock_gtod_data { @@ -1618,11 +1620,8 @@ struct pvclock_gtod_data { struct pvclock_clock clock; /* extract of a clocksource struct */ struct pvclock_clock raw_clock; /* extract of a clocksource struct */ - u64 boot_ns_raw; - u64 boot_ns; - u64 nsec_base; + ktime_t offs_boot; u64 wall_time_sec; - u64 monotonic_raw_nsec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1630,10 +1629,6 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns, boot_ns_raw; - - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); - boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); @@ -1643,20 +1638,20 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; + vdata->clock.offset = tk->tkr_mono.base; vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; vdata->raw_clock.mask = tk->tkr_raw.mask; vdata->raw_clock.mult = tk->tkr_raw.mult; vdata->raw_clock.shift = tk->tkr_raw.shift; - - vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; + vdata->raw_clock.offset = tk->tkr_raw.base; vdata->wall_time_sec = tk->xtime_sec; - vdata->boot_ns_raw = boot_ns_raw; - vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; + vdata->offs_boot = tk->offs_boot; write_seqcount_end(&vdata->seq); } @@ -2126,10 +2121,10 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) do { 
seq = read_seqcount_begin(>od->seq); - ns = gtod->monotonic_raw_nsec; + ns = gtod->raw_clock.base_cycles; ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); - ns >>= gtod->clock.shift; - ns += gtod->boot_ns_raw; + ns >>= gtod->raw_clock.shift; + ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -2146,7 +2141,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; - ns = gtod->nsec_base; + ns = gtod->clock.base_cycles; ns += vgettsc(>od->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); -- cgit v1.2.3-59-g8ed1b From 8171cd68806bd2fc28ef688e32fb2a3b3deb04e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 22 Jan 2020 14:36:09 +0100 Subject: KVM: x86: use raw clock values consistently Commit 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") changed kvmclock to use tkr_raw instead of tkr_mono. However, the default kvmclock_offset for the VM was still based on the monotonic clock and, if the raw clock drifted enough from the monotonic clock, this could cause a negative system_time to be written to the guest's struct pvclock. RHEL5 does not like it and (if it boots fast enough to observe a negative time value) it hangs. There is another thing to be careful about: getboottime64 returns the host boot time with tkr_mono frequency, and subtracting the tkr_raw-based kvmclock value will cause the wallclock to be off if tkr_raw drifts from tkr_mono. To avoid this, compute the wallclock delta from the current time instead of being clever and using getboottime64. Fixes: 53fafdbb8b21f ("KVM: x86: switch KVMCLOCK base to monotonic raw clock") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8faa721e4c38..6db92371ad21 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1655,6 +1655,18 @@ static void update_pvclock_gtod(struct timekeeper *tk) write_seqcount_end(&vdata->seq); } + +static s64 get_kvmclock_base_ns(void) +{ + /* Count up from boot time, but with the frequency of the raw clock. */ + return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); +} +#else +static s64 get_kvmclock_base_ns(void) +{ + /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ + return ktime_get_boottime_ns(); +} #endif void kvm_set_pending_timer(struct kvm_vcpu *vcpu) @@ -1668,7 +1680,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) int version; int r; struct pvclock_wall_clock wc; - struct timespec64 boot; + u64 wall_nsec; if (!wall_clock) return; @@ -1688,17 +1700,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. + * wall clock specified here. We do the reverse here. 
*/ - getboottime64(&boot); + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); - if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; + wc.nsec = do_div(wall_nsec, 1000000000); + wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); @@ -1946,7 +1953,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boottime_ns(); + ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { @@ -2284,7 +2291,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boottime_ns() + ka->kvmclock_offset; + return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2300,7 +2307,7 @@ u64 get_kvmclock_ns(struct kvm *kvm) &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boottime_ns() + ka->kvmclock_offset; + ret = get_kvmclock_base_ns() + ka->kvmclock_offset; put_cpu(); @@ -2399,7 +2406,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boottime_ns(); + kernel_ns = get_kvmclock_base_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2439,6 +2446,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_guest_tsc = tsc_timestamp; + WARN_ON(vcpu->hv_clock.system_time < 0); /* If the host uses TSC clocksource, then it is stable */ pvclock_flags = 0; @@ -9677,7 +9685,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boottime_ns(); + kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; -- cgit v1.2.3-59-g8ed1b From 9b5e85320fcc3af20ce0397b2c6363b6ee5815b6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 24 Jan 2020 15:07:22 -0800 Subject: KVM: x86: Take a u64 when checking for a valid dr7 value Take a u64 instead of an unsigned long in kvm_dr7_valid() to fix a build warning on i386 due to right-shifting a 32-bit value by 32 when checking for bits being set in dr7[63:32]. Alternatively, the warning could be resolved by rewriting the check to use an i386-friendly method, but taking a u64 fixes another oddity on 32-bit KVM. Beause KVM implements natural width VMCS fields as u64s to avoid layout issues between 32-bit and 64-bit, a devious guest can stuff vmcs12->guest_dr7 with a 64-bit value even when both the guest and host are 32-bit kernels. KVM eventually drops vmcs12->guest_dr7[63:32] when propagating vmcs12->guest_dr7 to vmcs02, but ideally KVM would not rely on that behavior for correctness. 
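A small standalone illustration of the type change (demo code, not the kernel's): with a u64 parameter the reserved-bit check really sees bits [63:32], whereas with a 32-bit unsigned long the value would be truncated at the call boundary and data >> 32 would shift a 32-bit value by its full width, which is what the i386 build warns about.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* u64 parameter: the reserved upper half is genuinely checked. */
static bool dr7_valid(uint64_t data)
{
	/* Bits [63:32] are reserved */
	return !(data >> 32);
}

int main(void)
{
	/* A value a devious guest could stuff into vmcs12->guest_dr7. */
	uint64_t guest_dr7 = 0x100000400ULL;

	/*
	 * With "unsigned long" on i386 the 0x1 in bits [63:32] would be
	 * truncated away before the check, and the shift by 32 would be
	 * undefined for the 32-bit type.
	 */
	printf("dr7 0x%llx valid: %d\n",
	       (unsigned long long)guest_dr7, dr7_valid(guest_dr7));
	return 0;
}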
Cc: Jim Mattson Cc: Krish Sadhukhan Fixes: ecb697d10f70 ("KVM: nVMX: Check GUEST_DR7 on vmentry of nested guests") Reported-by: Randy Dunlap Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2d2ff855773b..3624665acee4 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -357,7 +357,7 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -static inline bool kvm_dr7_valid(unsigned long data) +static inline bool kvm_dr7_valid(u64 data) { /* Bits [63:32] are reserved */ return !(data >> 32); -- cgit v1.2.3-59-g8ed1b From 7df003c85218b5f5b10a7f6418208f31e813f38f Mon Sep 17 00:00:00 2001 From: Zhuang Yanying Date: Sat, 12 Oct 2019 11:37:31 +0800 Subject: KVM: fix overflow of zero page refcount with ksm running MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are testing Virtual Machine with KSM on v5.4-rc2 kernel, and found the zero_page refcount overflow. The cause of refcount overflow is increased in try_async_pf (get_user_page) without being decreased in mmu_set_spte() while handling ept violation. In kvm_release_pfn_clean(), only unreserved page will call put_page. However, zero page is reserved. So, as well as creating and destroy vm, the refcount of zero page will continue to increase until it overflows. step1: echo 10000 > /sys/kernel/pages_to_scan/pages_to_scan echo 1 > /sys/kernel/pages_to_scan/run echo 1 > /sys/kernel/pages_to_scan/use_zero_pages step2: just create several normal qemu kvm vms. And destroy it after 10s. Repeat this action all the time. After a long period of time, all domains hang because of the refcount of zero page overflow. Qemu print error log as follow: … error: kvm run failed Bad address EAX=00006cdc EBX=00000008 ECX=80202001 EDX=078bfbfd ESI=ffffffff EDI=00000000 EBP=00000008 ESP=00006cc4 EIP=000efd75 EFL=00010002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] CS =0008 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA] SS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] DS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] FS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] GS =0010 00000000 ffffffff 00c09300 DPL=0 DS [-WA] LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy GDT= 000f7070 00000037 IDT= 000f70ae 00000000 CR0=00000011 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Code=00 01 00 00 00 e9 e8 00 00 00 c7 05 4c 55 0f 00 01 00 00 00 <8b> 35 00 00 01 00 8b 3d 04 00 01 00 b8 d8 d3 00 00 c1 e0 08 0c ea a3 00 00 01 00 c7 05 04 … Meanwhile, a kernel warning is departed. 
[40914.836375] WARNING: CPU: 3 PID: 82067 at ./include/linux/mm.h:987 try_get_page+0x1f/0x30 [40914.836412] CPU: 3 PID: 82067 Comm: CPU 0/KVM Kdump: loaded Tainted: G OE 5.2.0-rc2 #5 [40914.836415] RIP: 0010:try_get_page+0x1f/0x30 [40914.836417] Code: 40 00 c3 0f 1f 84 00 00 00 00 00 48 8b 47 08 a8 01 75 11 8b 47 34 85 c0 7e 10 f0 ff 47 34 b8 01 00 00 00 c3 48 8d 78 ff eb e9 <0f> 0b 31 c0 c3 66 90 66 2e 0f 1f 84 00 0 0 00 00 00 48 8b 47 08 a8 [40914.836418] RSP: 0018:ffffb4144e523988 EFLAGS: 00010286 [40914.836419] RAX: 0000000080000000 RBX: 0000000000000326 RCX: 0000000000000000 [40914.836420] RDX: 0000000000000000 RSI: 00004ffdeba10000 RDI: ffffdf07093f6440 [40914.836421] RBP: ffffdf07093f6440 R08: 800000424fd91225 R09: 0000000000000000 [40914.836421] R10: ffff9eb41bfeebb8 R11: 0000000000000000 R12: ffffdf06bbd1e8a8 [40914.836422] R13: 0000000000000080 R14: 800000424fd91225 R15: ffffdf07093f6440 [40914.836423] FS: 00007fb60ffff700(0000) GS:ffff9eb4802c0000(0000) knlGS:0000000000000000 [40914.836425] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [40914.836426] CR2: 0000000000000000 CR3: 0000002f220e6002 CR4: 00000000003626e0 [40914.836427] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [40914.836427] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [40914.836428] Call Trace: [40914.836433] follow_page_pte+0x302/0x47b [40914.836437] __get_user_pages+0xf1/0x7d0 [40914.836441] ? irq_work_queue+0x9/0x70 [40914.836443] get_user_pages_unlocked+0x13f/0x1e0 [40914.836469] __gfn_to_pfn_memslot+0x10e/0x400 [kvm] [40914.836486] try_async_pf+0x87/0x240 [kvm] [40914.836503] tdp_page_fault+0x139/0x270 [kvm] [40914.836523] kvm_mmu_page_fault+0x76/0x5e0 [kvm] [40914.836588] vcpu_enter_guest+0xb45/0x1570 [kvm] [40914.836632] kvm_arch_vcpu_ioctl_run+0x35d/0x580 [kvm] [40914.836645] kvm_vcpu_ioctl+0x26e/0x5d0 [kvm] [40914.836650] do_vfs_ioctl+0xa9/0x620 [40914.836653] ksys_ioctl+0x60/0x90 [40914.836654] __x64_sys_ioctl+0x16/0x20 [40914.836658] do_syscall_64+0x5b/0x180 [40914.836664] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [40914.836666] RIP: 0033:0x7fb61cb6bfc7 Signed-off-by: LinFeng Signed-off-by: Zhuang Yanying Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7e63a3236364..67ae2d5c37b2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -186,6 +186,7 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) */ if (pfn_valid(pfn)) return PageReserved(pfn_to_page(pfn)) && + !is_zero_pfn(pfn) && !kvm_is_zone_device_pfn(pfn); return true; -- cgit v1.2.3-59-g8ed1b From 64b38bd1906bb62a040b4e91815e56005db4784d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 31 Jan 2020 12:56:55 -0300 Subject: x86/kvm: do not setup pv tlb flush when not paravirtualized kvm_setup_pv_tlb_flush will waste memory and print a misguiding message when KVM paravirtualization is not available. Intel SDM says that the when cpuid is used with EAX higher than the maximum supported value for basic of extended function, the data for the highest supported basic function will be returned. So, in some systems, kvm_arch_para_features will return bogus data, causing kvm_setup_pv_tlb_flush to detect support for pv tlb flush. Testing for kvm_para_available will work as it checks for the hypervisor signature. Besides, when the "nopv" command line parameter is used, it should not continue as well, as kvm_guest_init will no be called in that case. 
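The ordering the patch establishes, as a standalone sketch (the two guards mirror the patch; everything else here is a stand-in): confirm the KVM hypervisor signature is present and that "nopv" was not given before consulting any KVM feature CPUID leaf, since on non-KVM systems that leaf can echo back unrelated data.

#include <stdbool.h>
#include <stdio.h>

static bool kvm_para_available;	/* hypervisor signature was found */
static bool nopv;			/* "nopv" given on the command line */

/* Stand-in for the KVM_FEATURE_PV_TLB_FLUSH / KVM_HINTS_REALTIME checks. */
static bool pv_tlb_flush_supported(void)
{
	return true;
}

static int setup_pv_tlb_flush(void)
{
	/* Bail out early; otherwise the feature leaf may return bogus data. */
	if (!kvm_para_available || nopv)
		return 0;

	if (pv_tlb_flush_supported())
		printf("KVM setup pv remote TLB flush\n");
	return 0;
}

int main(void)
{
	setup_pv_tlb_flush();		/* bare metal: silently does nothing */
	kvm_para_available = true;
	setup_pv_tlb_flush();		/* on KVM: proceeds to the feature check */
	return 0;
}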
Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 81045aabb6f4..d817f255aed8 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -736,6 +736,9 @@ static __init int kvm_setup_pv_tlb_flush(void) { int cpu; + if (!kvm_para_available() || nopv) + return 0; + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && !kvm_para_has_hint(KVM_HINTS_REALTIME) && kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { -- cgit v1.2.3-59-g8ed1b From 09df6307125cec07ef9168f1db2ffdbbcb304b1a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Feb 2020 10:41:59 -0800 Subject: KVM: MIPS: Fix a build error due to referencing not-yet-defined function Hoist kvm_mips_comparecount_wakeup() above its only user, kvm_arch_vcpu_create() to fix a compilation error due to referencing an undefined function. Fixes: d11dfed5d700 ("KVM: MIPS: Move all vcpu init code into kvm_arch_vcpu_create()") Reported-by: kbuild test robot Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 2606f3f02b54..92509041b954 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -280,6 +280,27 @@ static inline void dump_handler(const char *symbol, void *start, void *end) pr_debug("\tEND(%s)\n", symbol); } +static void kvm_mips_comparecount_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + kvm_mips_callbacks->queue_timer_int(vcpu); + + vcpu->arch.wait = 0; + if (swq_has_sleeper(&vcpu->wq)) + swake_up_one(&vcpu->wq); +} + +/* low level hrtimer wake routine */ +static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer); + kvm_mips_comparecount_func((unsigned long) vcpu); + return kvm_mips_count_timeout(vcpu); +} + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { return 0; @@ -1209,27 +1230,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -static void kvm_mips_comparecount_func(unsigned long data) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - - kvm_mips_callbacks->queue_timer_int(vcpu); - - vcpu->arch.wait = 0; - if (swq_has_sleeper(&vcpu->wq)) - swake_up_one(&vcpu->wq); -} - -/* low level hrtimer wake routine */ -static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer); - kvm_mips_comparecount_func((unsigned long) vcpu); - return kvm_mips_count_timeout(vcpu); -} - int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, struct kvm_translation *tr) { -- cgit v1.2.3-59-g8ed1b From 879a37632b403eb8c0fe00e14f907759100c8071 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Feb 2020 10:42:00 -0800 Subject: KVM: MIPS: Fold comparecount_func() into comparecount_wakeup() Fold kvm_mips_comparecount_func() into kvm_mips_comparecount_wakeup() to eliminate the nondescript function name as well as its unnecessary cast of a vcpu to "unsigned long" and back to a vcpu. 
Presumably func() was used as a callback at some point during pre-upstream development, as wakeup() is the only user of func() and has been the only user since both with introduced by commit 669e846e6c4e ("KVM/MIPS32: MIPS arch specific APIs for KVM"). Cc: Davidlohr Bueso Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 92509041b954..71244bf87c3a 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -280,24 +280,19 @@ static inline void dump_handler(const char *symbol, void *start, void *end) pr_debug("\tEND(%s)\n", symbol); } -static void kvm_mips_comparecount_func(unsigned long data) +/* low level hrtimer wake routine */ +static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) { - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer); kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; if (swq_has_sleeper(&vcpu->wq)) swake_up_one(&vcpu->wq); -} -/* low level hrtimer wake routine */ -static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(timer, struct kvm_vcpu, arch.comparecount_timer); - kvm_mips_comparecount_func((unsigned long) vcpu); return kvm_mips_count_timeout(vcpu); } -- cgit v1.2.3-59-g8ed1b From ea79a750927e1835fa869c9136bfd6da28e605e6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 4 Feb 2020 07:32:59 -0800 Subject: KVM: nVMX: Remove stale comment from nested_vmx_load_cr3() The blurb pertaining to the return value of nested_vmx_load_cr3() no longer matches reality, remove it entirely as the behavior it is attempting to document is quite obvious when reading the actual code. Signed-off-by: Sean Christopherson Reviewed-by: Krish Sadhukhan Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 53d522faaa69..0118637fb970 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1074,10 +1074,10 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) } /* - * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are - * emulating VM entry into a guest with EPT enabled. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. + * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are + * emulating VM-Entry into a guest with EPT enabled. On failure, the expected + * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to + * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, u32 *entry_failure_code) -- cgit v1.2.3-59-g8ed1b From 0a2b64c50db00196c85ec9e8e4c3d7506cd09db9 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 3 Feb 2020 15:09:09 -0800 Subject: kvm: mmu: Replace unsigned with unsigned int for PTE access There are several functions which pass an access permission mask for SPTEs as an unsigned. This works, but checkpatch complains about it. Switch the occurrences of unsigned to unsigned int to satisfy checkpatch. No functional change expected. Tested by running kvm-unit-tests on an Intel Haswell machine. 
This commit introduced no new failures. Signed-off-by: Ben Gardon Reviewed-by: Oliver Upton Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index adc84f0f16ba..7c544e17c5b3 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -452,7 +452,7 @@ static u64 get_mmio_spte_generation(u64 spte) } static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, - unsigned access) + unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); @@ -484,7 +484,7 @@ static unsigned get_mmio_spte_access(u64 spte) } static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - kvm_pfn_t pfn, unsigned access) + kvm_pfn_t pfn, unsigned int access) { if (unlikely(is_noslot_pfn(pfn))) { mark_mmio_spte(vcpu, sptep, gfn, access); @@ -2475,7 +2475,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gva_t gaddr, unsigned level, int direct, - unsigned access) + unsigned int access) { union kvm_mmu_page_role role; unsigned quadrant; @@ -2990,7 +2990,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1) static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int level, + unsigned int pte_access, int level, gfn_t gfn, kvm_pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -3081,9 +3081,10 @@ set_pte: return ret; } -static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, - int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, - bool speculative, bool host_writable) +static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned int pte_access, int write_fault, int level, + gfn_t gfn, kvm_pfn_t pfn, bool speculative, + bool host_writable) { int was_rmapped = 0; int rmap_count; @@ -3165,7 +3166,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, { struct page *pages[PTE_PREFETCH_NUM]; struct kvm_memory_slot *slot; - unsigned access = sp->role.access; + unsigned int access = sp->role.access; int i, ret; gfn_t gfn; @@ -3400,7 +3401,8 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) } static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, - kvm_pfn_t pfn, unsigned access, int *ret_val) + kvm_pfn_t pfn, unsigned int access, + int *ret_val) { /* The pfn is invalid, report the error! 
*/ if (unlikely(is_error_pfn(pfn))) { @@ -4005,7 +4007,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); - unsigned access = get_mmio_spte_access(spte); + unsigned int access = get_mmio_spte_access(spte); if (!check_mmio_spte(vcpu, spte)) return RET_PF_INVALID; @@ -4349,7 +4351,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, } static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, - unsigned access, int *nr_present) + unsigned int access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { -- cgit v1.2.3-59-g8ed1b From 8f79b064959b1c858cddad1cecbf0511adca8209 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 3 Feb 2020 15:09:10 -0800 Subject: kvm: mmu: Separate generating and setting mmio ptes Separate the functions for generating MMIO page table entries from the function that inserts them into the paging structure. This refactoring will facilitate changes to the MMU sychronization model to use atomic compare / exchanges (which are not guaranteed to succeed) instead of a monolithic MMU lock. No functional change expected. Tested by running kvm-unit-tests on an Intel Haswell machine. This commit introduced no new failures. Signed-off-by: Ben Gardon Reviewed-by: Oliver Upton Reviewed-by: Peter Shier Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7c544e17c5b3..7011a4e54866 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -451,9 +451,9 @@ static u64 get_mmio_spte_generation(u64 spte) return gen; } -static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, - unsigned int access) +static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { + u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 mask = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; @@ -464,6 +464,17 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, mask |= (gpa & shadow_nonpresent_or_rsvd_mask) << shadow_nonpresent_or_rsvd_mask_len; + return mask; +} + +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, + unsigned int access) +{ + u64 mask = make_mmio_spte(vcpu, gfn, access); + unsigned int gen = get_mmio_spte_generation(mask); + + access = mask & ACC_ALL; + trace_mark_mmio_spte(sptep, gfn, access, gen); mmu_spte_set(sptep, mask); } -- cgit v1.2.3-59-g8ed1b From 31de3d2500e49e9f44fdda1830a37f4d9735bcdd Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 Feb 2020 13:30:33 +0100 Subject: x86/kvm/hyper-v: move VMX controls sanitization out of nested_enable_evmcs() With fine grained VMX feature enablement QEMU>=4.2 tries to do KVM_SET_MSRS with default (matching CPU model) values and in case eVMCS is also enabled, fails. It would be possible to drop VMX feature filtering completely and make this a guest's responsibility: if it decides to use eVMCS it should know which fields are available and which are not. Hyper-V mostly complies to this, however, there are some problematic controls: SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL which Hyper-V enables. As there are no corresponding fields in eVMCS, we can't handle this properly in KVM. This is a Hyper-V issue. 
Move VMX controls sanitization from nested_enable_evmcs() to vmx_get_msr(), and do the bare minimum (only clear controls which are known to cause issues). This allows userspace to keep setting controls it wants and at the same time hides them from the guest. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 32 ++++++++++++++++++++++++++------ arch/x86/kvm/vmx/evmcs.h | 1 + arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 89c3e0caf39f..ba886fb7bc39 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -346,6 +346,32 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu) return 0; } +void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) +{ + u32 ctl_low = (u32)*pdata; + u32 ctl_high = (u32)(*pdata >> 32); + + /* + * Hyper-V 2016 and 2019 try using these features even when eVMCS + * is enabled but there are no corresponding fields. + */ + switch (msr_index) { + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + break; + case MSR_IA32_VMX_ENTRY_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + break; + } + + *pdata = ctl_low | ((u64)ctl_high << 32); +} + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { @@ -356,11 +382,5 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu, if (vmcs_version) *vmcs_version = nested_get_evmcs_version(vcpu); - vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL; - vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL; - vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL; - vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC; - vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC; - return 0; } diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 07ebf6882a45..b88d9807a796 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -201,5 +201,6 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); +void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata); #endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 678edbd6e278..ba334acaa37e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1853,8 +1853,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, - &msr_info->data); + if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, + &msr_info->data)) + return 1; + /* + * Enlightened VMCS v1 doesn't have certain fields, but buggy + * Hyper-V versions are still trying to use corresponding + * features when they are exposed. Filter out the essential + * minimum. 
+ */ + if (!msr_info->host_initiated && + vmx->nested.enlightened_vmcs_enabled) + nested_evmcs_filter_control_msr(msr_info->index, + &msr_info->data); + break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; -- cgit v1.2.3-59-g8ed1b From a83502314ce303c6341b249c41121759c7477ba1 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 5 Feb 2020 13:30:34 +0100 Subject: x86/kvm/hyper-v: don't allow to turn on unsupported VMX controls for nested guests Sane L1 hypervisors are not supposed to turn any of the unsupported VMX controls on for its guests and nested_vmx_check_controls() checks for that. This is, however, not the case for the controls which are supported on the host but are missing in enlightened VMCS and when eVMCS is in use. It would certainly be possible to add these missing checks to nested_check_vm_execution_controls()/_vm_exit_controls()/.. but it seems preferable to keep eVMCS-specific stuff in eVMCS and reduce the impact on non-eVMCS guests by doing less unrelated checks. Create a separate nested_evmcs_check_controls() for this purpose. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/evmcs.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/evmcs.h | 2 ++ arch/x86/kvm/vmx/nested.c | 3 +++ 3 files changed, 58 insertions(+) diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index ba886fb7bc39..303813423c3e 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -7,6 +7,7 @@ #include "evmcs.h" #include "vmcs.h" #include "vmx.h" +#include "trace.h" DEFINE_STATIC_KEY_FALSE(enable_evmcs); @@ -372,6 +373,58 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata) *pdata = ctl_low | ((u64)ctl_high << 32); } +int nested_evmcs_check_controls(struct vmcs12 *vmcs12) +{ + int ret = 0; + u32 unsupp_ctl; + + unsupp_ctl = vmcs12->pin_based_vm_exec_control & + EVMCS1_UNSUPPORTED_PINCTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported pin-based VM-execution controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->secondary_vm_exec_control & + EVMCS1_UNSUPPORTED_2NDEXEC; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported secondary VM-execution controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_exit_controls & + EVMCS1_UNSUPPORTED_VMEXIT_CTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-exit controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_entry_controls & + EVMCS1_UNSUPPORTED_VMENTRY_CTRL; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-entry controls", + unsupp_ctl); + ret = -EINVAL; + } + + unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC; + if (unsupp_ctl) { + trace_kvm_nested_vmenter_failed( + "eVMCS: unsupported VM-function controls", + unsupp_ctl); + ret = -EINVAL; + } + + return ret; +} + int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version) { diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index b88d9807a796..6de47f2569c9 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -10,6 +10,7 @@ #include "capabilities.h" #include "vmcs.h" +#include "vmcs12.h" struct vmcs_config; @@ -202,5 +203,6 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata); +int 
nested_evmcs_check_controls(struct vmcs12 *vmcs12); #endif /* __KVM_X86_VMX_EVMCS_H */ diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0118637fb970..657c2eda357c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2757,6 +2757,9 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, nested_check_vm_entry_controls(vcpu, vmcs12)) return -EINVAL; + if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled) + return nested_evmcs_check_controls(vmcs12); + return 0; } -- cgit v1.2.3-59-g8ed1b From 4400cf546b4bb62d49198f6642add01bf6e9b34d Mon Sep 17 00:00:00 2001 From: Eric Hankland Date: Mon, 27 Jan 2020 13:22:56 -0800 Subject: KVM: x86: Fix perfctr WRMSR for running counters Correct the logic in intel_pmu_set_msr() for fixed and general purpose counters. This was recently changed to set pmc->counter without taking in to account the value of pmc_read_counter() which will be incorrect if the counter is currently running and non-zero; this changes back to the old logic which accounted for the value of currently running counters. Signed-off-by: Eric Hankland Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 34a3a17bb6d7..fd21cdb10b79 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -260,13 +260,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { - if (msr_info->host_initiated) - pmc->counter = data; - else - pmc->counter = (s32)data; + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter = data; + pmc->counter += data - pmc_read_counter(pmc); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) -- cgit v1.2.3-59-g8ed1b From df7e8818926eb4712b67421442acf7d568fe2645 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Feb 2020 16:10:52 +0100 Subject: KVM: SVM: relax conditions for allowing MSR_IA32_SPEC_CTRL accesses Userspace that does not know about the AMD_IBRS bit might still allow the guest to protect itself with MSR_IA32_SPEC_CTRL using the Intel SPEC_CTRL bit. However, svm.c disallows this and will cause a #GP in the guest when writing to the MSR. Fix this by loosening the test and allowing the Intel CPUID bit, and in fact allow the AMD_STIBP bit as well since it allows writing to MSR_IA32_SPEC_CTRL too. Reported-by: Zhiyi Guo Analyzed-by: Dr. 
David Alan Gilbert Analyzed-by: Laszlo Ersek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bf0556588ad0..a3e32d61d60c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4225,6 +4225,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; @@ -4310,6 +4312,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; -- cgit v1.2.3-59-g8ed1b From bcfcff640c4d736933c5990d5a801d6a0c22c28b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 5 Feb 2020 16:20:23 +0100 Subject: x86: vmxfeatures: rename features for consistency with KVM and manual Three of the feature bits in vmxfeatures.h have names that are different from the Intel SDM. The names have been adjusted recently in KVM but they were using the old name in the tip tree's x86/cpu branch. Adjust for consistency. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 6 +++--- arch/x86/include/asm/vmxfeatures.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d380b3b7ddd9..2a85287b3685 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -22,8 +22,8 @@ /* * Definitions of Primary Processor-Based VM-Execution Controls. 
  */
-#define CPU_BASED_INTR_WINDOW_EXITING           VMCS_CONTROL_BIT(VIRTUAL_INTR_PENDING)
-#define CPU_BASED_USE_TSC_OFFSETTING            VMCS_CONTROL_BIT(TSC_OFFSETTING)
+#define CPU_BASED_INTR_WINDOW_EXITING           VMCS_CONTROL_BIT(INTR_WINDOW_EXITING)
+#define CPU_BASED_USE_TSC_OFFSETTING            VMCS_CONTROL_BIT(USE_TSC_OFFSETTING)
 #define CPU_BASED_HLT_EXITING                   VMCS_CONTROL_BIT(HLT_EXITING)
 #define CPU_BASED_INVLPG_EXITING                VMCS_CONTROL_BIT(INVLPG_EXITING)
 #define CPU_BASED_MWAIT_EXITING                 VMCS_CONTROL_BIT(MWAIT_EXITING)
@@ -34,7 +34,7 @@
 #define CPU_BASED_CR8_LOAD_EXITING              VMCS_CONTROL_BIT(CR8_LOAD_EXITING)
 #define CPU_BASED_CR8_STORE_EXITING             VMCS_CONTROL_BIT(CR8_STORE_EXITING)
 #define CPU_BASED_TPR_SHADOW                    VMCS_CONTROL_BIT(VIRTUAL_TPR)
-#define CPU_BASED_NMI_WINDOW_EXITING            VMCS_CONTROL_BIT(VIRTUAL_NMI_PENDING)
+#define CPU_BASED_NMI_WINDOW_EXITING            VMCS_CONTROL_BIT(NMI_WINDOW_EXITING)
 #define CPU_BASED_MOV_DR_EXITING                VMCS_CONTROL_BIT(MOV_DR_EXITING)
 #define CPU_BASED_UNCOND_IO_EXITING             VMCS_CONTROL_BIT(UNCOND_IO_EXITING)
 #define CPU_BASED_USE_IO_BITMAPS                VMCS_CONTROL_BIT(USE_IO_BITMAPS)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
index 0d04d8bf15a5..a50e4a0de315 100644
--- a/arch/x86/include/asm/vmxfeatures.h
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -34,8 +34,8 @@
 #define VMX_FEATURE_EPTP_SWITCHING      ( 0*32+ 28) /* EPTP switching (in guest) */
 
 /* Primary Processor-Based VM-Execution Controls, word 1 */
-#define VMX_FEATURE_VIRTUAL_INTR_PENDING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */
-#define VMX_FEATURE_TSC_OFFSETTING      ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */
+#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */
+#define VMX_FEATURE_USE_TSC_OFFSETTING  ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */
 #define VMX_FEATURE_HLT_EXITING         ( 1*32+ 7) /* "" VM-Exit on HLT */
 #define VMX_FEATURE_INVLPG_EXITING      ( 1*32+ 9) /* "" VM-Exit on INVLPG */
 #define VMX_FEATURE_MWAIT_EXITING       ( 1*32+ 10) /* "" VM-Exit on MWAIT */
@@ -46,7 +46,7 @@
 #define VMX_FEATURE_CR8_LOAD_EXITING    ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */
 #define VMX_FEATURE_CR8_STORE_EXITING   ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */
 #define VMX_FEATURE_VIRTUAL_TPR         ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */
-#define VMX_FEATURE_VIRTUAL_NMI_PENDING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */
+#define VMX_FEATURE_NMI_WINDOW_EXITING  ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */
 #define VMX_FEATURE_MOV_DR_EXITING      ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */
 #define VMX_FEATURE_UNCOND_IO_EXITING   ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/
 #define VMX_FEATURE_USE_IO_BITMAPS      ( 1*32+ 25) /* "" VM-Exit based on I/O port */
--
cgit v1.2.3-59-g8ed1b


From d76c7fbc01b29257359ed8b0d16d662e725b7bf9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson
Date: Tue, 28 Jan 2020 15:53:44 -0800
Subject: KVM: x86: Mark CR4.UMIP as reserved based on associated CPUID bit

Re-add code to mark CR4.UMIP as reserved if UMIP is not supported by the
host. The UMIP handling was unintentionally dropped during a recent
refactoring. Not flagging CR4.UMIP allows the guest to set its CR4.UMIP
regardless of host support or userspace desires. On CPUs with UMIP
support, including emulated UMIP, this allows the guest to enable UMIP
against the wishes of the userspace VMM. On CPUs without any form of
UMIP, this results in a failed VM-Enter due to invalid guest state.

Fixes: 345599f9a2928 ("KVM: x86: Add macro to ensure reserved cr4 bits checks stay in sync")
Signed-off-by: Sean Christopherson
Reviewed-by: Vitaly Kuznetsov
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6db92371ad21..fbabb2f06273 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -898,6 +898,8 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
 		__reserved_bits |= X86_CR4_PKE;         \
 	if (!__cpu_has(__c, X86_FEATURE_LA57))          \
 		__reserved_bits |= X86_CR4_LA57;        \
+	if (!__cpu_has(__c, X86_FEATURE_UMIP))          \
+		__reserved_bits |= X86_CR4_UMIP;        \
 	__reserved_bits;                                \
 })
 
--
cgit v1.2.3-59-g8ed1b


From a8be1ad01b795bd2a13297ddbaecdb956ab0efd0 Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Wed, 5 Feb 2020 23:33:53 +0800
Subject: KVM: vmx: delete meaningless vmx_decache_cr0_guest_bits() declaration

The forward declaration of vmx_decache_cr0_guest_bits() is unnecessary:
the function is only called below its implementation, so the declaration
serves no purpose and can be removed.

Signed-off-by: Miaohe Lin
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/vmx.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index ba334acaa37e..9a6664886f2e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1428,8 +1428,6 @@ static bool emulation_required(struct kvm_vcpu *vcpu)
 	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
 }
 
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
 unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
--
cgit v1.2.3-59-g8ed1b
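
The reserved-bit pattern that the CR4.UMIP patch above extends is simple to
model outside the kernel. The sketch below is a minimal, self-contained
illustration of that pattern, not kernel code: the cpu_has_*() helpers and
their hard-coded answers are hypothetical stand-ins for the __cpu_has()/CPUID
plumbing, and only the CR4 bit positions follow the architectural layout.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative CR4 bit positions (these match the architectural layout). */
#define X86_CR4_UMIP (1ULL << 11)
#define X86_CR4_LA57 (1ULL << 12)
#define X86_CR4_PKE  (1ULL << 22)

/* Hypothetical host feature probes standing in for __cpu_has()/CPUID. */
static bool cpu_has_umip(void) { return false; } /* assume a host without UMIP */
static bool cpu_has_la57(void) { return true; }
static bool cpu_has_pke(void)  { return true; }

/*
 * Build the mask of CR4 bits the guest must not set: every feature the host
 * lacks contributes its CR4 bit, mirroring the macro the patch extends.
 */
static uint64_t cr4_reserved_bits(void)
{
	uint64_t reserved = 0;

	if (!cpu_has_pke())
		reserved |= X86_CR4_PKE;
	if (!cpu_has_la57())
		reserved |= X86_CR4_LA57;
	if (!cpu_has_umip())
		reserved |= X86_CR4_UMIP; /* the check the patch re-adds */

	return reserved;
}

/* A guest write to CR4 is rejected if it sets any reserved bit. */
static int check_guest_cr4(uint64_t cr4)
{
	return (cr4 & cr4_reserved_bits()) ? -1 : 0;
}

int main(void)
{
	/* With UMIP absent on the host, setting CR4.UMIP is refused. */
	printf("set CR4.UMIP: %d\n", check_guest_cr4(X86_CR4_UMIP)); /* -1 */
	printf("leave it clear: %d\n", check_guest_cr4(0));          /*  0 */
	return 0;
}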