From d222af072380c4470295c07d84ecb15f4937e365 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 6 Sep 2017 15:20:55 +1000 Subject: KVM: PPC: Book3S HV: Don't access XIVE PIPR register using byte accesses The XIVE interrupt controller on POWER9 machines doesn't support byte accesses to any register in the thread management area other than the CPPR (current processor priority register). In particular, when reading the PIPR (pending interrupt priority register), we need to do a 32-bit or 64-bit load. Cc: stable@vger.kernel.org # v4.13 Fixes: 2c4fb78f78b6 ("KVM: PPC: Book3S HV: Workaround POWER9 DD1.0 bug causing IPB bit loss") Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xive.c | 1 - arch/powerpc/kvm/book3s_xive.c | 1 - arch/powerpc/kvm/book3s_xive_template.c | 7 ++++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c index abf5f01b6eb1..5b81a807d742 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void) #define __x_tima get_tima_phys() #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_readb __raw_rm_readb #define __x_writeb __raw_rm_writeb #define __x_readw __raw_rm_readw #define __x_readq __raw_rm_readq diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 08b200a0bbce..13304622ab1c 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -48,7 +48,6 @@ #define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_readb __raw_readb #define __x_writeb __raw_writeb #define __x_readw __raw_readw #define __x_readq __raw_readq diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index d1ed2c41b5d2..c7a5deadd1cc 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) * bit. */ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr >= xc->hw_cppr) return; } @@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; u8 pending = xc->pending; u32 hirq; - u8 pipr; pr_devel("H_IPOLL(server=%ld)\n", server); @@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long pending = 0xff; } else { /* Grab pending interrupt if any */ - pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr < 8) pending |= 1 << pipr; } -- cgit v1.2.3-59-g8ed1b From cf5f6f3125241853462334b1bc696f3c3c492178 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 Sep 2017 16:05:30 +1000 Subject: KVM: PPC: Book3S HV: Hold kvm->lock around call to kvmppc_update_lpcr Commit 468808bd35c4 ("KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9", 2017-01-30) added a call to kvmppc_update_lpcr() which doesn't hold the kvm->lock mutex around the call, as required. 
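The fix below takes kvm->lock across the whole update. Condensed, the patched function looks roughly like this (a sketch reconstructed from the diff that follows; the function's other validation is omitted):

static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
	unsigned long lpcr;

	if ((cfg->process_table & PRTS_MASK) > 24)
		return -EINVAL;

	mutex_lock(&kvm->lock);
	kvm->arch.process_table = cfg->process_table;
	kvmppc_setup_partition_table(kvm);

	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
	mutex_unlock(&kvm->lock);

	return 0;
}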
This adds the lock/unlock pair, and for good measure, includes the kvmppc_setup_partition_table() call in the locked region, since it is altering global state of the VM. This error appears not to have any fatal consequences for the host; the consequences would be that the VCPUs could end up running with different LPCR values, or an update to the LPCR value by userspace using the one_reg interface could get overwritten, or the update done by kvmhv_configure_mmu() could get overwritten. Cc: stable@vger.kernel.org # v4.10+ Fixes: 468808bd35c4 ("KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 18e974a34fce..a7177c284f9b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + mutex_lock(&kvm->lock); kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + mutex_unlock(&kvm->lock); return 0; } -- cgit v1.2.3-59-g8ed1b From 67f8a8c1151c9ef3d1285905d1e66ebb769ecdf7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Sep 2017 13:47:23 +1000 Subject: KVM: PPC: Book3S HV: Fix bug causing host SLB to be restored incorrectly Aneesh Kumar reported seeing host crashes when running recent kernels on POWER8. The symptom was an oops like this: Unable to handle kernel paging request for data at address 0xf00000000786c620 Faulting instruction address: 0xc00000000030e1e4 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: powernv_op_panel CPU: 24 PID: 6663 Comm: qemu-system-ppc Tainted: G W 4.13.0-rc7-43932-gfc36c59 #2 task: c000000fdeadfe80 task.stack: c000000fdeb68000 NIP: c00000000030e1e4 LR: c00000000030de6c CTR: c000000000103620 REGS: c000000fdeb6b450 TRAP: 0300 Tainted: G W (4.13.0-rc7-43932-gfc36c59) MSR: 9000000000009033 CR: 24044428 XER: 20000000 CFAR: c00000000030e134 DAR: f00000000786c620 DSISR: 40000000 SOFTE: 0 GPR00: 0000000000000000 c000000fdeb6b6d0 c0000000010bd000 000000000000e1b0 GPR04: c00000000115e168 c000001fffa6e4b0 c00000000115d000 c000001e1b180386 GPR08: f000000000000000 c000000f9a8913e0 f00000000786c600 00007fff587d0000 GPR12: c000000fdeb68000 c00000000fb0f000 0000000000000001 00007fff587cffff GPR16: 0000000000000000 c000000000000000 00000000003fffff c000000fdebfe1f8 GPR20: 0000000000000004 c000000fdeb6b8a8 0000000000000001 0008000000000040 GPR24: 07000000000000c0 00007fff587cffff c000000fdec20bf8 00007fff587d0000 GPR28: c000000fdeca9ac0 00007fff587d0000 00007fff587c0000 00007fff587d0000 NIP [c00000000030e1e4] __get_user_pages_fast+0x434/0x1070 LR [c00000000030de6c] __get_user_pages_fast+0xbc/0x1070 Call Trace: [c000000fdeb6b6d0] [c00000000139dab8] lock_classes+0x0/0x35fe50 (unreliable) [c000000fdeb6b7e0] [c00000000030ef38] get_user_pages_fast+0xf8/0x120 [c000000fdeb6b830] [c000000000112318] kvmppc_book3s_hv_page_fault+0x308/0xf30 [c000000fdeb6b960] [c00000000010e10c] kvmppc_vcpu_run_hv+0xfdc/0x1f00 [c000000fdeb6bb20] [c0000000000e915c] kvmppc_vcpu_run+0x2c/0x40 [c000000fdeb6bb40] [c0000000000e5650] kvm_arch_vcpu_ioctl_run+0x110/0x300 [c000000fdeb6bbe0] [c0000000000d6468] kvm_vcpu_ioctl+0x528/0x900 [c000000fdeb6bd40] 
[c0000000003bc04c] do_vfs_ioctl+0xcc/0x950 [c000000fdeb6bde0] [c0000000003bc930] SyS_ioctl+0x60/0x100 [c000000fdeb6be30] [c00000000000b96c] system_call+0x58/0x6c Instruction dump: 7ca81a14 2fa50000 41de0010 7cc8182a 68c60002 78c6ffe2 0b060000 3cc2000a 794a3664 390610d8 e9080000 7d485214 7d435378 790507e1 408202f0 ---[ end trace fad4a342d0414aa2 ]--- It turns out that the SLB entry for the vmemmap region hasn't been reloaded on exit from a guest, and it has the wrong page size. Then, when the host next accesses the vmemmap region, it gets a page fault. Commit a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM", 2017-07-24) modified the guest exit code so that it now only clears out the SLB for hash guests. The code tests the radix flag and puts the result in a non-volatile CR field, CR2, and later branches based on CR2. Unfortunately, the kvmppc_save_tm function, which gets called between those two points, modifies all the user-visible registers in the case where the guest was in transactional or suspended state, except for a few which it restores (namely r1, r2, r9 and r13). Thus the hash/radix indication in CR2 gets corrupted. This fixes the problem by re-doing the comparison just before the result is needed. For good measure, this also adds comments next to the call sites of kvmppc_save_tm and kvmppc_restore_tm pointing out that non-volatile register state will be lost. Cc: stable@vger.kernel.org # v4.13 Fixes: a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM") Tested-by: Aneesh Kumar K.V Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 663a4a861e7f..17936f82d3c7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* * Are we running hash or radix ?
*/ - beq cr2,3f + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi cr2, r0, 0 + beq cr2, 3f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ ld r9, HSTATE_KVM_VCPU(r13) bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) @@ -2578,6 +2590,9 @@ kvm_end_cede: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -- cgit v1.2.3-59-g8ed1b From 98152b83e065d5593c261b0cc58197666f94a34f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 28 Aug 2017 16:38:35 +0200 Subject: KVM: x86: Remove .get_pkru() from kvm_x86_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 9dd21e104bc ('KVM: x86: simplify handling of PKRU') removed all users and providers of that call-back, but didn't remove it. Remove it now. Signed-off-by: Joerg Roedel Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8844eee290b2..bbb802382a3e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -951,7 +951,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); -- cgit v1.2.3-59-g8ed1b From 49a8afca386ee1775519a4aa80f8e121bd227dd4 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Tue, 5 Sep 2017 23:58:44 +0200 Subject: KVM: SVM: Add a missing 'break' statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan H. Schönherr Fixes: f6511935f424 ("KVM: SVM: Add checks for IO instructions") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2c1cfe68a9af..af54327b9017 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5302,6 +5302,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + break; case SVM_EXIT_IOIO: { u64 exit_info; u32 bytes; -- cgit v1.2.3-59-g8ed1b From a05950009f50ca971a1d616655d01628177bd2e3 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Wed, 6 Sep 2017 00:27:19 +0200 Subject: KVM: x86: Fix handling of pending signal on uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM API says that KVM_RUN will return with -EINTR when a signal is pending. However, if a vCPU is in KVM_MP_STATE_UNINITIALIZED, then the return value is unconditionally -EAGAIN. Copy over some code from vcpu_run(), so that the case of a pending signal results in the expected return value. Signed-off-by: Jan H. 
Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6069af86da3b..b27f7f0020e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7235,6 +7235,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } goto out; } -- cgit v1.2.3-59-g8ed1b From 2f173d2688559a6f85643d38a2ad6f45eb420c42 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Wed, 6 Sep 2017 18:34:06 +0200 Subject: KVM: x86: Fix immediate_exit handling for uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When user space sets kvm_run->immediate_exit, KVM is supposed to return quickly. However, when a vCPU is in KVM_MP_STATE_UNINITIALIZED, the value is not considered and the vCPU blocks. Fix that oversight. Fixes: 460df4c1fc7c008 ("KVM: race-free exit from KVM_RUN without POSIX signals") Signed-off-by: Jan H. Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b27f7f0020e3..c777a8f681a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7231,6 +7231,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (kvm_run->immediate_exit) { + r = -EINTR; + goto out; + } kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); -- cgit v1.2.3-59-g8ed1b From 021086e383fa408a219f6c6541b37f495f59d576 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 13 Sep 2017 14:17:22 +0200 Subject: KVM: fix rcu warning on VM_CREATE errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3898da947bba ("KVM: avoid using rcu_dereference_protected") can trigger the following lockdep/rcu splat if the VM_CREATE ioctl fails, for example if kvm_arch_init_vm fails: WARNING: suspicious RCU usage 4.13.0+ #105 Not tainted ----------------------------- ./include/linux/kvm_host.h:481 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 no locks held by qemu-system-s39/79. stack backtrace: CPU: 0 PID: 79 Comm: qemu-system-s39 Not tainted 4.13.0+ #105 Hardware name: IBM 2964 NC9 704 (KVM/Linux) Call Trace: ([<00000000001140b2>] show_stack+0xea/0xf0) [<00000000008a68a4>] dump_stack+0x94/0xd8 [<0000000000134c12>] kvm_dev_ioctl+0x372/0x7a0 [<000000000038f940>] do_vfs_ioctl+0xa8/0x6c8 [<0000000000390004>] SyS_ioctl+0xa4/0xb8 [<00000000008c7a8c>] system_call+0xc4/0x27c no locks held by qemu-system-s39/79. We have to reset the just created users_count back to 0 to tell the check to not trigger. 
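For reference, the lockdep check being silenced lives in kvm_get_bus(); in this era of the tree it reads approximately as follows (a sketch, so treat the exact expression as approximate):

static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
{
	/* Safe either under slots_lock or once the VM has no users left. */
	return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
				      lockdep_is_held(&kvm->slots_lock) ||
				      !refcount_read(&kvm->users_count));
}

Resetting users_count to 0 on the error path satisfies the second condition, telling the checker that the half-constructed VM is not visible to anyone.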
Reported-by: Stefan Haberland Signed-off-by: Christian Borntraeger Fixes: 3898da947bba ("KVM: avoid using rcu_dereference_protected") Cc: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6ed1c2021198..2d7df5cc955b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -674,6 +674,7 @@ out_err_no_irq_srcu: out_err_no_srcu: hardware_disable_all(); out_err_no_disable: + refcount_set(&kvm->users_count, 0); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) -- cgit v1.2.3-59-g8ed1b From 5153723388c8691b886eb9ad28dde92c01f9b191 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 13 Sep 2017 15:13:46 +0200 Subject: KVM: x86: fix clang build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang resolves __builtin_constant_p() to false even if the expression is constant in the end. The only purpose of that expression was to differentiate a case where the following expression couldn't be checked at compile-time, so we can just remove the check. Clang handles the following two correctly. Turn it into BUG_ON if there are any more problems with this. Fixes: d6321d493319 ("KVM: x86: generalize guest_cpuid_has_ helpers") Reported-by: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1ea3c0e1e3a9..0bc5c1315708 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(!__builtin_constant_p(x86_leaf)); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); -- cgit v1.2.3-59-g8ed1b From dfa20099e26e35cd9e83d4d43172615c73c4e9f5 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:40 -0500 Subject: KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparing the base code for subsequent changes. This does not change existing logic. 
Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af54327b9017..cab020acb509 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1600,6 +1600,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +static int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + + if (!avic) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + + return ret; +} + static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; @@ -1636,14 +1653,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; - if (avic) { - err = avic_init_backing_page(&svm->vcpu); - if (err) - goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - } + err = avic_init_vcpu(svm); + if (err) + goto free_page4; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. -- cgit v1.2.3-59-g8ed1b From b2a05feff274e007abd7cda0d2cdae7fdf5cbb0a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:41 -0500 Subject: KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify struct kvm_x86_ops.get_enable_apicv() to take a struct kvm_vcpu pointer as parameter, in preparation for subsequent changes. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bbb802382a3e..c73e493adf07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -972,7 +972,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(void); + bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cab020acb509..76f559f5a3dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4407,7 +4407,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static bool svm_get_enable_apicv(void) +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { return avic; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4253adef9044..09204993a739 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ } } -static bool vmx_get_enable_apicv(void) +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c777a8f681a8..f4a978e36030 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7980,7 +7980,7 @@ int
kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) -- cgit v1.2.3-59-g8ed1b From 67034bb9dd5ecc0171bdacfbd31ebf7c2634df98 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:42 -0500 Subject: KVM: SVM: Add irqchip_split() checks before enabling AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SVM AVIC hardware accelerates guest writes to the APIC_EOI register (for edge-triggered interrupts), which means they do not trap to KVM. So, only enable SVM AVIC in split irqchip mode (e.g. when launching qemu with the option '-machine kernel_irqchip=split'). Suggested-by: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support") [Removed pr_debug - Radim.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 76f559f5a3dd..0e68f0b3cbf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - svm->vcpu.arch.apicv_active = true; } static void init_vmcb(struct vcpu_svm *svm) @@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } - if (avic) + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); /* @@ -1604,7 +1603,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) { int ret; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return 0; ret = avic_init_backing_page(&svm->vcpu); @@ -4409,7 +4408,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { - return avic; + return avic && irqchip_split(vcpu->kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -4426,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return; vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; -- cgit v1.2.3-59-g8ed1b From a5f01f8e9756b40a56e351f98b3fae6d235b834f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Sep 2017 04:04:01 -0700 Subject: KVM: X86: Don't block vCPU if there is pending exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't block a vCPU if there is a pending exception.
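To see why this helps: kvm_vcpu_block() keeps the vCPU blocked only while kvm_arch_vcpu_runnable() is false, and on x86 that boils down to roughly the following (a sketch of the era's code, not quoted verbatim):

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	/* A pending exception now counts as an event, so a halted vCPU
	 * with an exception queued becomes runnable and the exception
	 * is injected instead of being delayed indefinitely. */
	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}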
Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4a978e36030..bfda79fb102c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8461,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; + if (vcpu->arch.exception.pending) + return true; + if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu))) -- cgit v1.2.3-59-g8ed1b From 9a6e7c39810e4a8bc7fc95056cefb40583fe07ef Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Sep 2017 03:54:16 -0700 Subject: KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit

qemu-system-x86-8600 [004] d..1 7205.687530: kvm_entry: vcpu 2
qemu-system-x86-8600 [004] .... 7205.687532: kvm_exit: reason EXCEPTION_NMI rip 0xffffffffa921297d info ffffeb2c0e44e018 80000b0e
qemu-system-x86-8600 [004] .... 7205.687532: kvm_page_fault: address ffffeb2c0e44e018 error_code 0
qemu-system-x86-8600 [004] .... 7205.687620: kvm_try_async_get_page: gva = 0xffffeb2c0e44e018, gfn = 0x427e4e
qemu-system-x86-8600 [004] .N.. 7205.687628: kvm_async_pf_not_present: token 0x8b002 gva 0xffffeb2c0e44e018
kworker/4:2-7814 [004] .... 7205.687655: kvm_async_pf_completed: gva 0xffffeb2c0e44e018 address 0x7fcc30c4e000
qemu-system-x86-8600 [004] .... 7205.687703: kvm_async_pf_ready: token 0x8b002 gva 0xffffeb2c0e44e018
qemu-system-x86-8600 [004] d..1 7205.687711: kvm_entry: vcpu 2

After running a memory-intensive workload in the guest, I caught a case where the kworker completes the GUP too quickly and queues a "Page Ready" #PF exception after the "Page not Present" exception but before the next vmentry, as in the trace above, which results in a #DF being injected into the guest. This patch fixes it by clearing the queued "Page not Present" exception if "Page Ready" occurs before the next vmentry, since the GUP has already got the required page and the shadow page table has already been fixed up by the "Page Ready" handler. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Fixes: 7c90705bf2a3 ("KVM: Inject asynchronous page fault into a PV guest if page is swapped out.") [Changed indentation and added clearing of injected.
- Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfda79fb102c..cd17b7d9a107 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8631,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8658,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8665,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.injected = false; + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + fault.async_page_fault = true; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3-59-g8ed1b From 488e32f1981e506976d666b3772d6ccc8b726fee Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 14 Sep 2017 11:50:07 +0200 Subject: KVM: trace events: update list of exit reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding entries for exit reasons 23 - 27: KVM_EXIT_EPR KVM_EXIT_SYSTEM_EVENT KVM_EXIT_S390_STSI KVM_EXIT_IOAPIC_EOI KVM_EXIT_HYPERV Signed-off-by: Ladi Prosek Reviewed-by: Cornelia Huck Signed-off-by: Radim Krčmář --- include/trace/events/kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8ade3eb6c640..dcffedfac431 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -14,7 +14,9 @@ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ - ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH) + ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ + ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ + ERSN(HYPERV) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), -- cgit v1.2.3-59-g8ed1b From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 12 Sep 2017 
13:02:54 -0700 Subject: kvm: nVMX: Don't allow L2 to access the hardware CR8 If L1 does not specify the "use TPR shadow" VM-execution control in vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store exiting" VM-execution controls in vmcs02. Failure to do so will give the L2 VM unrestricted read/write access to the hardware CR8. This fixes CVE-2017-12154. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 09204993a739..8e1ae716f938 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10525,6 +10525,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & CPU_BASED_TPR_SHADOW) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } /* -- cgit v1.2.3-59-g8ed1b From 36ae3c0a36b7456432fedce38ae2f7bd3e01a563 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Thu, 7 Sep 2017 19:02:48 +0100 Subject: KVM: Don't accept obviously wrong gsi values via KVM_IRQFD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We cannot add routes for gsi values >= KVM_MAX_IRQ_ROUTES -- see kvm_set_irq_routing(). Hence, there is no sense in accepting them via KVM_IRQFD. Prevent them from entering the system in the first place. Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f2ac53ab8243..c608ab495282 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; + if (args->gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); -- cgit v1.2.3-59-g8ed1b From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Thu, 7 Sep 2017 19:02:30 +0100 Subject: KVM: VMX: Do not BUG() on out-of-bounds guest IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of the guest_irq argument to vmx_update_pi_irte() ultimately comes from a KVM_IRQFD API call. Do not BUG() in vmx_update_pi_irte() if the value is out of bounds. (Especially since KVM as a whole seems to hang after that.) Instead, print a message only once if we find that we don't have a route for a certain IRQ (which can be out-of-bounds or within the array). This fixes CVE-2017-1000252. Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") Signed-off-by: Jan H.
Schönherr Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1ae716f938..0b15b43ef45d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11834,7 +11834,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11843,7 +11843,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) -- cgit v1.2.3-59-g8ed1b From 8cd641e3c7cbf86c7cbd2a17a160dd137d86c860 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:18 -0700 Subject: sched/wait: Add swq_has_sleeper() Which is the equivalent of what we have in regular waitqueues. I'm not crazy about the name, but this also helps us get both apis closer -- which iirc comes originally from the -net folks. We also duplicate the comments for the lockless swait_active(), from wait.h. Future users will make use of this interface. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- include/linux/swait.h | 58 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/include/linux/swait.h b/include/linux/swait.h index 4a4e180d0a35..73e97a08d3d0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name DECLARE_SWAIT_QUEUE_HEAD(name) #endif -static inline int swait_active(struct swait_queue_head *q) +/** + * swait_active -- locklessly test for waiters on the queue + * @wq: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * NOTE2: this function has the same above implications as regular waitqueues. + * + * Use either while holding swait_queue_head::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (swait_active(wq_head)) if (@cond) + * wake_up(wq_head); break; + * schedule(); + * } + * finish_swait(&wq_head, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * swait_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * This, in turn, can trigger missing wakeups. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. 
+ */ +static inline int swait_active(struct swait_queue_head *wq) +{ + return !list_empty(&wq->task_list); +} + +/** + * swq_has_sleeper - check if there are any waiting processes + * @wq: the waitqueue to test for waiters + * + * Returns true if @wq has waiting processes + * + * Please refer to the comment for swait_active. + */ +static inline bool swq_has_sleeper(struct swait_queue_head *wq) { - return !list_empty(&q->task_list); + /* + * We need to be sure we are in sync with the list_add() + * modifications to the wait queue (task_list). + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return swait_active(wq); } extern void swake_up(struct swait_queue_head *q); -- cgit v1.2.3-59-g8ed1b From b9f67a420b3d76991592558af06e9cf1b8953b3d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:19 -0700 Subject: kvm,async_pf: Use swq_has_sleeper() ... as we've got the new helper now. This caller already does the right thing, hence no changes in semantics. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index bb298a200cd3..57bcb27dcf30 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); - /* - * This memory barrier pairs with prepare_to_wait's set_current_state() - */ - smp_mb(); - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); mmput(mm); -- cgit v1.2.3-59-g8ed1b From cc1b46803a671047be22f7832ef4a2bb3f63dd94 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:20 -0700 Subject: kvm,lapic: Justify use of swait_active() A comment might serve future readers. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index aaf10b6f5380..69c5612be786 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + /* + * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ if (swait_active(q)) swake_up(q); -- cgit v1.2.3-59-g8ed1b From a0cff57bb2a41cb9cbf13d3203097b4156d8c0ae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:21 -0700 Subject: kvm,x86: Fix apf_task_wake_one() wq serialization During code inspection, the following potential race was seen:

CPU0					CPU1
kvm_async_pf_task_wait			apf_task_wake_one
					[L] swait_active(&n->wq)
[S] prepare_to_swait(&n.wq)
[L] if (!hlist_unhashed(&n.link))
	schedule()			[S] hlist_del_init(&n->link);

Properly serialize swait_active() checks such that a wakeup is not missed.
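The pattern shared by this and the following swq_has_sleeper() conversions, as a sketch (cond stands for whatever condition the waiter checks; wq and wait are assumed to be a swait_queue_head and swait_queue):

/* Waker: the store to cond must be ordered before the lockless check
 * of the wait queue; swq_has_sleeper() supplies the full barrier that
 * a bare swait_active() lacks. */
cond = true;
if (swq_has_sleeper(&wq))
	swake_up(&wq);

/* Waiter: prepare_to_swait() queues the task and sets its state with
 * the pairing barrier before cond is re-checked. */
for (;;) {
	prepare_to_swait(&wq, &wait, TASK_INTERRUPTIBLE);
	if (cond)
		break;
	schedule();
}
finish_swait(&wq, &wait);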
Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 874827b0d7ca..aa60a08b65b1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) + else if (swq_has_sleeper(&n->wq)) swake_up(&n->wq); } -- cgit v1.2.3-59-g8ed1b From 5e0018b3e39e9b44dbfb380b83026e55d2f65b91 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:22 -0700 Subject: kvm: Serialize wq active checks in kvm_vcpu_wake_up() This is a generic call and can be susceptible to races in reading the wq task_list while another task is adding itself to the list. Add a full barrier by using the swq_has_sleeper() helper. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d7df5cc955b..9deb5a245b83 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2187,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; return true; -- cgit v1.2.3-59-g8ed1b From 267ad7bc2d3f69af536035b6a3e4a9a2b6ae11dc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:23 -0700 Subject: kvm,powerpc: Serialize wq active checks in ops->vcpu_kick Particularly because kvmppc_fast_vcpu_kick_hv() is a callback, ensure that we properly serialize wq active checks in order to avoid potentially missing a wakeup due to racing with the waiter side. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a7177c284f9b..73bf1ebfa78f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; } -- cgit v1.2.3-59-g8ed1b From 4c0b4bc60f95de4741c89b41174760258343f091 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:24 -0700 Subject: kvm,mips: Fix potential swait_active() races For example, the following could occur, making us miss a wakeup:

CPU0					CPU1
kvm_vcpu_block				kvm_mips_comparecount_func
					[L] swait_active(&vcpu->wq)
[S] prepare_to_swait(&vcpu->wq)
[L] if (!kvm_vcpu_has_pending_timer(vcpu))
	schedule()			[S] queue_timer_int(vcpu)

Ensure that the swait_active() check is not hoisted over the interrupt.
Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index bce2a6431430..d535edc01434 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (swait_active(&dvcpu->wq)) + if (swq_has_sleeper(&dvcpu->wq)) swake_up(&dvcpu->wq); return 0; @@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); } -- cgit v1.2.3-59-g8ed1b From 7881f96cac4d420c94e62a4e1eea243899a7052e Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:40 -0700 Subject: kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry After a successful VM-entry, RFLAGS is cleared, with the exception of bit 1, which is always set. This is handled by load_vmcs12_host_state. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0b15b43ef45d..29f85ed5a329 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11491,16 +11491,18 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. - */ if (unlikely(vmx->fail)) { + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ vmx->fail = 0; nested_vmx_failValid(vcpu, vm_inst_error); - } else - nested_vmx_succeed(vcpu); + } + if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; -- cgit v1.2.3-59-g8ed1b From b060ca3b2e9e72ef005e2042476f95ee0b8839e9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:42 -0700 Subject: kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly On an early VMLAUNCH/VMRESUME failure (i.e. one which sets the VM-instruction error field of the current VMCS), the launch state of the current VMCS is not set to "launched," and the VM-exit information fields of the current VMCS (including IDT-vectoring information and exit reason) are stale. On a late VMLAUNCH/VMRESUME failure (i.e. one which sets the high bit of the exit reason field), the launch state of the current VMCS is not set to "launched," and only two of the VM-exit information fields of the current VMCS are modified (exit reason and exit qualification). The remaining VM-exit information fields of the current VMCS (including IDT-vectoring information, in particular) are stale. 
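Condensed, the ordering that the diff below establishes at the tail of vmx_vcpu_run() is (a sketch of the hunk that follows):

	vmx->idt_vectoring_info = 0;

	vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
	if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
		return;	/* early or late failure: launched stays 0 */

	vmx->loaded_vmcs->launched = 1;
	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

Both failure flavours now bail out before the launch state is set or the stale IDT-vectoring information is read.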
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f85ed5a329..a0a78f09b22d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current @@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); -- cgit v1.2.3-59-g8ed1b From 4f350c6dbcb9000e18907515ec8a7b205ac33c69 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:44 -0700 Subject: kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly When emulating a nested VM-entry from L1 to L2, several control field validation checks are deferred to the hardware. Should one of these validation checks fail, vmx_vcpu_run will set the vmx->fail flag. When this happens, the L2 guest state is not loaded (even in part), and execution should continue in L1 with the next instruction after the VMLAUNCH/VMRESUME. The VMCS12 is not modified (except for the VM-instruction error field), the VMCS12 MSR save/load lists are not processed, and the CPU state is not loaded from the VMCS12 host area. Moreover, the vmcs02 exit reason is stale, so it should not be consulted for any reason.
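Stripped to its control flow, nested_vmx_vmexit() after this patch has roughly this shape (a sketch of the diff below, with the unchanged middle elided):

	leave_guest_mode(vcpu);
	if (likely(!vmx->fail)) {
		prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
			       exit_qualification);
		/* ... process the vmcs12 MSR store list ... */
	}
	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
	/* ... update vmcs01 state that is safe in both cases ... */
	if (likely(!vmx->fail)) {
		/* ... ack interrupt, trace the injected exit ... */
		load_vmcs12_host_state(vcpu, vmcs12);
		return;
	}
	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
	skip_emulated_instruction(vcpu);
	vmx->fail = 0;

On the failure path, nothing derived from the stale vmcs02 exit information is consulted, and the instruction skip that was lost while vmcs02 was current is redone against vmcs01.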
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 134 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 59 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a0a78f09b22d..a406afbb6d21 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } /* * The host physical addresses of some pages of guest memory @@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - if (vmx->nested.nested_run_pending) - return false; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: @@ -11395,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. + */ + WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD)); + leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (likely(!vmx->fail)) { + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); - if (unlikely(vmx->fail)) - vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + } vmx_switch_vmcs(vcpu, &vmx->vmcs01); - - /* - * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of - * the VM-exit interrupt information (valid interrupt) is always set to - * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need - * kvm_cpu_has_interrupt(). See the commit message for details. 
- */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; - } - - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -11443,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11493,23 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (unlikely(vmx->fail)) { - /* - * After an early L2 VM-entry failure, we're now back - * in L1 which thinks it just finished a VMLAUNCH or - * VMRESUME instruction, so we need to set the failure - * flag and the VM-instruction error field of the VMCS - * accordingly. - */ - vmx->fail = 0; - nested_vmx_failValid(vcpu, vm_inst_error); - } - if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* + * The emulated instruction was already skipped in + * nested_vmx_run, but the updated RIP was never + * written back to the vmcs01. + */ + skip_emulated_instruction(vcpu); + vmx->fail = 0; } /* -- cgit v1.2.3-59-g8ed1b