From bdaffe1d93e7eddbcc71d074a5d49eba7fe1c765 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 15:03:06 +0200 Subject: KVM: x86: set TMR when the interrupt is accepted Do not compute TMR in advance. Instead, set the TMR just before the interrupt is accepted into the IRR. This limits the coupling between IOAPIC and LAPIC. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 9 ++------- arch/x86/kvm/ioapic.h | 3 +-- arch/x86/kvm/lapic.c | 19 ++++++++++--------- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/x86.c | 5 +---- 5 files changed, 14 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 856f79105bb5..eaf4ec38d980 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,8 +246,7 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic) smp_wmb(); } -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr) +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; union kvm_ioapic_redirect_entry *e; @@ -260,13 +259,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode)) { + e->fields.dest_id, e->fields.dest_mode)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); - if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG) - __set_bit(e->fields.vector, - (unsigned long *)tmr); - } } } spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index ca0b0b4e6256..3dbd0e2aac4e 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -120,7 +120,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map); int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap, - u32 *tmr); +void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8d9013c5e1ee..5693dd9fc163 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -551,15 +551,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - int i; - - for (i = 0; i < 8; i++) - apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]); -} - static void apic_update_ppr(struct kvm_lapic *apic) { u32 tpr, isrv, ppr, old_ppr; @@ -781,6 +772,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_LOWEST: vcpu->arch.apic_arb_prio++; case APIC_DM_FIXED: + if (unlikely(trig_mode && !level)) + break; + /* FIXME add logic for vcpu on reset */ if (unlikely(!apic_enabled(apic))) break; @@ -790,6 +784,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (dest_map) __set_bit(vcpu->vcpu_id, dest_map); + if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) { + if (trig_mode) + apic_set_vector(vector, apic->regs + APIC_TMR); + else + apic_clear_vector(vector, apic->regs + APIC_TMR); + } + if (kvm_x86_ops->deliver_posted_interrupt) kvm_x86_ops->deliver_posted_interrupt(vcpu, vector); else { diff --git 
a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 764037991d26..eb46d6bcaa75 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); -void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr); void __kvm_apic_update_irr(u32 *pir, void *regs); void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92511d4b7236..c1ed74ebc502 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6144,17 +6144,14 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; - u32 tmr[8]; if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; memset(eoi_exit_bitmap, 0, 32); - memset(tmr, 0, 32); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr); + kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); - kvm_apic_update_tmr(vcpu, tmr); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3-59-g8ed1b From 3bb345f387dd26beb097cf776e342bc0d96d805a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 10:43:18 +0200 Subject: KVM: x86: store IOAPIC-handled vectors in each VCPU We can reuse the algorithm that computes the EOI exit bitmap to figure out which vectors are handled by the IOAPIC. The only difference between the two is for edge-triggered interrupts other than IRQ8 that have no notifiers active; however, the IOAPIC does not have to do anything special for these interrupts anyway. This again limits the interactions between the IOAPIC and the LAPIC, making it easier to move the former to userspace. Inspired by a patch from Steve Rutherford. 
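For a concrete picture of what "handled by the IOAPIC" reduces to after this change (an illustrative sketch, not code from the patch): a vector is IOAPIC-handled exactly when its bit is set in the vCPU's 256-bit eoi_exit_bitmap, which the kernel stores as four u64 words and tests with test_bit().

        /* Illustrative only: test bit 'vector' of a 256-bit bitmap laid
         * out as u64 eoi_exit_bitmap[4], mirroring the per-vCPU check
         * this series introduces in lapic.c.
         */
        static inline int vector_handled_by_ioapic(const u64 *eoi_exit_bitmap,
                                                   int vector)
        {
                return (eoi_exit_bitmap[vector / 64] >> (vector % 64)) & 1;
        }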
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/ioapic.c | 18 ++---------------- arch/x86/kvm/ioapic.h | 8 -------- arch/x86/kvm/lapic.c | 10 ++++++++-- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 3 ++- arch/x86/kvm/x86.c | 8 +++----- 7 files changed, 18 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2beee0382088..33609c2c743b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -396,6 +396,7 @@ struct kvm_vcpu_arch { u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + u64 eoi_exit_bitmap[4]; unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; @@ -822,7 +823,7 @@ struct kvm_x86_ops { int (*vm_has_apicv)(struct kvm *kvm); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); - void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); + void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index eaf4ec38d980..2dcda0f188ba 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -233,19 +233,6 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr) } -static void update_handled_vectors(struct kvm_ioapic *ioapic) -{ - DECLARE_BITMAP(handled_vectors, 256); - int i; - - memset(handled_vectors, 0, sizeof(handled_vectors)); - for (i = 0; i < IOAPIC_NUM_PINS; ++i) - __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors); - memcpy(ioapic->handled_vectors, handled_vectors, - sizeof(handled_vectors)); - smp_wmb(); -} - void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) { struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; @@ -310,7 +297,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits |= (u32) val; e->fields.remote_irr = 0; } - update_handled_vectors(ioapic); mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -594,7 +580,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic) ioapic->id = 0; memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS); rtc_irq_eoi_tracking_reset(ioapic); - update_handled_vectors(ioapic); } static const struct kvm_io_device_ops ioapic_mmio_ops = { @@ -623,8 +608,10 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); + return ret; } + kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -661,7 +648,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - update_handled_vectors(ioapic); kvm_vcpu_request_scan_ioapic(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 3dbd0e2aac4e..bf36d66a1951 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -73,7 +73,6 @@ struct kvm_ioapic { struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; - DECLARE_BITMAP(handled_vectors, 256); struct rtc_status rtc_status; struct delayed_work eoi_inject; u32 irq_eoi[IOAPIC_NUM_PINS]; @@ -98,13 +97,6 @@ 
static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } -static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector) -{ - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - smp_rmb(); - return test_bit(vector, ioapic->handled_vectors); -} - void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5693dd9fc163..4c30fb0a48a1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -869,14 +869,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) +{ + return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap); +} + static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + if (kvm_ioapic_handles_vector(apic, vector)) { int trigger_mode; if (apic_test_vector(vector, apic->regs + APIC_TMR)) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } } @@ -1923,7 +1929,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, /* Cache not set: could be safe but we don't bother. */ apic->highest_isr_cache == -1 || /* Need EOI to update ioapic. */ - kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) { /* * PV EOI was disabled by apic_sync_pv_eoi_from_guest * so we need not do anything here. diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f9ed1ff0632..09582081276e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3664,7 +3664,7 @@ static int svm_vm_has_apicv(struct kvm *kvm) return 0; } -static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu) { return; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 06ef4908ba61..9381b1d34313 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8043,8 +8043,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) } } -static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { + u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; if (!vmx_vm_has_apicv(vcpu->kvm)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1ed74ebc502..c4adb8847b23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6143,15 +6143,13 @@ static void process_smi(struct kvm_vcpu *vcpu) static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { - u64 eoi_exit_bitmap[4]; - if (!kvm_apic_hw_enabled(vcpu->arch.apic)) return; - memset(eoi_exit_bitmap, 0, 32); + memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu); } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) -- cgit v1.2.3-59-g8ed1b From d50ab6c1a2b24e12d3012d7beb343eba5b94a6ca Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 11:49:59 +0200 Subject: KVM: x86: replace vm_has_apicv hook with cpu_uses_apicv This will avoid an unnecessary trip to ->kvm and from 
there to the VPIC. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 8 +++++++- 6 files changed, 15 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33609c2c743b..a0ef289d5a86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -820,7 +820,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - int (*vm_has_apicv)(struct kvm *kvm); + int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm *kvm, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index a1ec6a50a05a..c0dad893dc59 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -63,7 +63,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_apic_vid_enabled(v->kvm)) + if (kvm_vcpu_apic_vid_enabled(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4c30fb0a48a1..c568d69c7060 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -390,7 +390,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) vcpu = apic->vcpu; - if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) { + if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) { /* try to update RVI */ apic_clear_vector(vec, apic->regs + APIC_IRR); kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -1622,7 +1622,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu); apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 
1 : 0; apic->highest_isr_cache = -1; update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index eb46d6bcaa75..7259d272416f 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -143,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) return apic->vcpu->arch.apic_base & X2APIC_ENABLE; } -static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->vm_has_apicv(kvm); + return kvm_x86_ops->cpu_uses_apicv(vcpu); } static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 09582081276e..9a37e487aca0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3659,7 +3659,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static int svm_vm_has_apicv(struct kvm *kvm) +static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu) { return 0; } @@ -4425,7 +4425,7 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, - .vm_has_apicv = svm_vm_has_apicv, + .cpu_uses_apicv = svm_cpu_uses_apicv, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9381b1d34313..b8a90c4f676f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -810,6 +810,7 @@ static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); static int vmx_vm_has_apicv(struct kvm *kvm); +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -4337,6 +4338,11 @@ static int vmx_vm_has_apicv(struct kvm *kvm) return enable_apicv && irqchip_in_kernel(kvm); } +static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) +{ + return vmx_vm_has_apicv(vcpu->kvm); +} + static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -10362,7 +10368,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .update_cr8_intercept = update_cr8_intercept, .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .vm_has_apicv = vmx_vm_has_apicv, + .cpu_uses_apicv = vmx_cpu_uses_apicv, .load_eoi_exitmap = vmx_load_eoi_exitmap, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, -- cgit v1.2.3-59-g8ed1b From 35754c987f252e859bfa390a6816e85563afe79d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 29 Jul 2015 12:05:37 +0200 Subject: KVM: x86: introduce lapic_in_kernel Avoid pointer chasing and memory barriers, and simplify the code when split irqchip (LAPIC in kernel, IOAPIC/PIC in userspace) is introduced. 
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq.c | 6 +++--- arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/lapic.c | 4 ++-- arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 46 ++++++++++++++++++++-------------------------- arch/x86/kvm/x86.c | 18 +++++++++--------- 7 files changed, 45 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index c0dad893dc59..b653ae202c8e 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -57,7 +57,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) */ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -75,7 +75,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.pending; if (kvm_cpu_has_extint(v)) @@ -103,7 +103,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { int vector; - if (!irqchip_in_kernel(v->kvm)) + if (!lapic_in_kernel(v)) return v->arch.interrupt.nr; vector = kvm_cpu_get_extint(v); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 3d782a2c336a..9e6e7e04de98 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -92,6 +92,14 @@ static inline int irqchip_in_kernel(struct kvm *kvm) return vpic != NULL; } +static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) +{ + /* Same as irqchip_in_kernel(vcpu->kvm), but with less + * pointer chasing and no unnecessary memory barriers. + */ + return vcpu->arch.apic != NULL; +} + void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c568d69c7060..c4bcc86d6dc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1985,7 +1985,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_ICR2) @@ -2002,7 +2002,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) struct kvm_lapic *apic = vcpu->arch.apic; u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0; - if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic)) + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; if (reg == APIC_DFR || reg == APIC_ICR2) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff606f507913..c3f39aa9b9cb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3427,7 +3427,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9a37e487aca0..34c089023827 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3060,7 +3060,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ r = cr_interception(svm); - if (irqchip_in_kernel(svm->vcpu.kvm)) + if (lapic_in_kernel(&svm->vcpu)) return r; if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) return r; @@ -3305,7 +3305,7 @@ static int 
interrupt_window_interception(struct vcpu_svm *svm) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(svm->vcpu.kvm) && + if (!lapic_in_kernel(&svm->vcpu) && kvm_run->request_interrupt_window && !kvm_cpu_has_interrupt(&svm->vcpu)) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b8a90c4f676f..4729b7f6e5f2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -809,7 +809,6 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static bool vmx_mpx_supported(void); static bool vmx_xsaves_supported(void); -static int vmx_vm_has_apicv(struct kvm *kvm); static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -947,9 +946,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void) return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline bool vm_need_tpr_shadow(struct kvm *kvm) +static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu) { - return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); + return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu); } static inline bool cpu_has_secondary_exec_ctrls(void) @@ -1063,9 +1062,9 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { - return flexpriority_enabled && irqchip_in_kernel(kvm); + return flexpriority_enabled && lapic_in_kernel(vcpu); } static inline bool cpu_has_vmx_vpid(void) @@ -2378,7 +2377,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (vmx_cpu_uses_apicv(&vmx->vcpu)) vmx->nested.nested_vmx_pinbased_ctls_high |= PIN_BASED_POSTED_INTR; @@ -4333,14 +4332,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr) msr, MSR_TYPE_W); } -static int vmx_vm_has_apicv(struct kvm *kvm) -{ - return enable_apicv && irqchip_in_kernel(kvm); -} - static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) { - return vmx_vm_has_apicv(vcpu->kvm); + return enable_apicv && lapic_in_kernel(vcpu); } static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) @@ -4520,7 +4514,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; return pin_based_exec_ctrl; } @@ -4532,7 +4526,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) exec_control &= ~CPU_BASED_MOV_DR_EXITING; - if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { + if (!cpu_need_tpr_shadow(&vmx->vcpu)) { exec_control &= ~CPU_BASED_TPR_SHADOW; #ifdef CONFIG_X86_64 exec_control |= CPU_BASED_CR8_STORE_EXITING | @@ -4549,7 +4543,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 
@@ -4563,7 +4557,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; - if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + if (!vmx_cpu_uses_apicv(&vmx->vcpu)) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; @@ -4624,7 +4618,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } - if (vmx_vm_has_apicv(vmx->vcpu.kvm)) { + if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); vmcs_write64(EOI_EXIT_BITMAP1, 0); vmcs_write64(EOI_EXIT_BITMAP2, 0); @@ -4768,7 +4762,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vcpu->kvm)) + if (cpu_need_tpr_shadow(vcpu)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); @@ -4776,7 +4770,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx_vm_has_apicv(vcpu->kvm)) + if (vmx_cpu_uses_apicv(vcpu)) memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); if (vmx->vpid != 0) @@ -5316,7 +5310,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return 1; if (cr8_prev <= cr8) return 1; @@ -5535,7 +5529,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) * If the user space waits to inject interrupts, exit as soon as * possible */ - if (!irqchip_in_kernel(vcpu->kvm) && + if (!lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; @@ -7944,10 +7938,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) * apicv */ if (!cpu_has_vmx_virtualize_x2apic_mode() || - !vmx_vm_has_apicv(vcpu->kvm)) + !vmx_cpu_uses_apicv(vcpu)) return; - if (!vm_need_tpr_shadow(vcpu->kvm)) + if (!cpu_need_tpr_shadow(vcpu)) return; sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); @@ -8052,7 +8046,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu) { u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap; - if (!vmx_vm_has_apicv(vcpu->kvm)) + if (!vmx_cpu_uses_apicv(vcpu)) return; vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); @@ -8551,7 +8545,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) put_cpu(); if (err) goto free_vmcs; - if (vm_need_virtualize_apic_accesses(kvm)) { + if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { err = alloc_apic_access_page(kvm); if (err) goto free_vmcs; @@ -9344,7 +9338,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->nested.apic_access_page)); } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && - (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { + cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; kvm_vcpu_reload_apic_access_page(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4adb8847b23..88fb3bf2ae6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -788,7 +788,7 @@ int kvm_set_cr8(struct 
kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) return 1; - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { - if (irqchip_in_kernel(vcpu->kvm)) + if (lapic_in_kernel(vcpu)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; @@ -3175,7 +3175,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vapic_addr va; r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; if (copy_from_user(&va, argp, sizeof va)) @@ -5666,7 +5666,7 @@ void kvm_arch_exit(void) int kvm_vcpu_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - if (irqchip_in_kernel(vcpu->kvm)) { + if (lapic_in_kernel(vcpu)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; } else { @@ -6162,7 +6162,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { struct page *page = NULL; - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops->set_apic_access_page_addr) @@ -6200,7 +6200,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && + bool req_int_win = !lapic_in_kernel(vcpu) && vcpu->run->request_interrupt_window; bool req_immediate_exit = false; @@ -6597,7 +6597,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) { + if (!lapic_in_kernel(vcpu)) { if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { r = -EINVAL; goto out; @@ -7297,7 +7297,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { - return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); + return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu); } struct static_key kvm_no_apic_vcpu __read_mostly; @@ -7391,7 +7391,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) kvm_mmu_destroy(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); - if (!irqchip_in_kernel(vcpu->kvm)) + if (!lapic_in_kernel(vcpu)) static_key_slow_dec(&kvm_no_apic_vcpu); } -- cgit v1.2.3-59-g8ed1b From 4ca7dd8ce4b24e18f94eed90e80c6eb80fb48c9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 30 Jul 2015 10:32:16 +0200 Subject: KVM: x86: unify handling of interrupt window The interrupt window is currently checked twice, once in vmx.c/svm.c and once in dm_request_for_irq_injection. The only difference is the extra check for kvm_arch_interrupt_allowed in dm_request_for_irq_injection, and the different return value (EINTR/KVM_EXIT_INTR for vmx.c/svm.c vs. 0/KVM_EXIT_IRQ_WINDOW_OPEN for dm_request_for_irq_injection). However, dm_request_for_irq_injection is basically dead code! Revive it by removing the checks in vmx.c and svm.c's vmexit handlers, and fixing the returned values for the dm_request_for_irq_injection case. 
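As a usage illustration (hypothetical VMM code, not from the patch; vcpu_fd and the mmap'ed kvm_run structure are assumed to be set up already, and inject_pending_irq() stands in for the VMM's own injection logic), a VMM with a userspace irqchip now sees the unified exit like this:

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        extern void inject_pending_irq(int vcpu_fd); /* hypothetical VMM helper */

        static void run_vcpu_once(int vcpu_fd, struct kvm_run *run, int irq_pending)
        {
                /* Ask KVM to exit as soon as the guest can accept an interrupt. */
                run->request_interrupt_window = irq_pending;
                ioctl(vcpu_fd, KVM_RUN, 0);
                if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN) {
                        /* After this patch KVM_RUN returns 0 here (it used to
                         * be -EINTR with KVM_EXIT_INTR); the window is open,
                         * so inject now. */
                        inject_pending_irq(vcpu_fd);
                }
        }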
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 13 ------------- arch/x86/kvm/vmx.c | 11 ----------- arch/x86/kvm/x86.c | 4 ++-- 3 files changed, 2 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 34c089023827..54a86183e5d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3294,24 +3294,11 @@ static int msr_interception(struct vcpu_svm *svm) static int interrupt_window_interception(struct vcpu_svm *svm) { - struct kvm_run *kvm_run = svm->vcpu.run; - kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; mark_dirty(svm->vmcb, VMCB_INTR); ++svm->vcpu.stat.irq_window_exits; - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(&svm->vcpu) && - kvm_run->request_interrupt_window && - !kvm_cpu_has_interrupt(&svm->vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } - return 1; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4729b7f6e5f2..6ee0dc69675b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5524,17 +5524,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); ++vcpu->stat.irq_window_exits; - - /* - * If the user space waits to inject interrupts, exit as soon as - * possible - */ - if (!lapic_in_kernel(vcpu) && - vcpu->run->request_interrupt_window && - !kvm_cpu_has_interrupt(vcpu)) { - vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - return 0; - } return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88fb3bf2ae6d..28c98c8f9d1c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6469,8 +6469,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu) kvm_inject_pending_timer_irqs(vcpu); if (dm_request_for_irq_injection(vcpu)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + r = 0; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; ++vcpu->stat.request_irq_exits; break; } -- cgit v1.2.3-59-g8ed1b From 49df6397edfc5a8ba8ca813b51fb9729d8e94b40 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:40 -0700 Subject: KVM: x86: Split the APIC from the rest of IRQCHIP. First patch in a series which enables the relocation of the PIC/IOAPIC to userspace. Adds capability KVM_CAP_SPLIT_IRQCHIP; KVM_CAP_SPLIT_IRQCHIP enables the construction of LAPICs without the rest of the irqchip. Compile tested for x86. Signed-off-by: Steve Rutherford Suggested-by: Andrew Honig Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 17 +++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/i8254.c | 4 +++- arch/x86/kvm/ioapic.h | 8 ++++++++ arch/x86/kvm/irq.h | 11 ++++++++++- arch/x86/kvm/irq_comm.c | 9 ++++++++- arch/x86/kvm/lapic.c | 6 ++++-- arch/x86/kvm/x86.c | 23 +++++++++++++++++++++-- include/linux/kvm_host.h | 1 + include/uapi/linux/kvm.h | 1 + 10 files changed, 75 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index d9ecceea5a02..43e0816d0de1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3627,6 +3627,23 @@ struct { KVM handlers should exit to userspace with rc = -EREMOTE. +7.5 KVM_CAP_SPLIT_IRQCHIP + +Architectures: x86 +Parameters: None +Returns: 0 on success, -1 on error + +Create a local apic for each processor in the kernel. 
This can be used +instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the +IOAPIC and PIC (and also the PIT, even though this has to be enabled +separately). + +This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel +IOAPIC or PIC. This also enables in kernel routing of interrupt requests. + +Fails if VCPU has already been created, or if the irqchip is already in the +kernel (i.e. KVM_CREATE_IRQCHIP has already been called). + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0ef289d5a86..befcf555bddc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -684,6 +684,8 @@ struct kvm_arch { u32 bsp_vcpu_id; u64 disabled_quirks; + + bool irqchip_split; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index f90952f64e79..08116ff227cc 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -35,6 +35,7 @@ #include #include +#include "ioapic.h" #include "irq.h" #include "i8254.h" #include "x86.h" @@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; s64 interval; - if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) + if (!ioapic_in_kernel(kvm) || + ps->flags & KVM_PIT_FLAGS_HPET_LEGACY) return; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index bf36d66a1951..a8842c0dee73 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -97,6 +97,14 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) return kvm->arch.vioapic; } +static inline int ioapic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (ioapic_irqchip(kvm) != NULL); + return ret; +} + void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9e6e7e04de98..2f9703dcd913 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,13 +83,22 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int irqchip_split(struct kvm *kvm) +{ + return kvm->arch.irqchip_split; +} + static inline int irqchip_in_kernel(struct kvm *kvm) { struct kvm_pic *vpic = pic_irqchip(kvm); + bool ret; + + ret = (vpic != NULL); + ret |= irqchip_split(kvm); /* Read vpic before kvm->irq_routing. 
*/ smp_rmb(); - return vpic != NULL; + return ret; } static inline int lapic_in_kernel(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 9efff9e5b58c..67f6b62a6814 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -208,7 +208,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!irqchip_in_kernel(kvm)) + if (!ioapic_in_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); @@ -328,3 +328,10 @@ int kvm_setup_default_irq_routing(struct kvm *kvm) return kvm_set_irq_routing(kvm, default_routing, ARRAY_SIZE(default_routing), 0); } + +static const struct kvm_irq_routing_entry empty_routing[] = {}; + +int kvm_setup_empty_irq_routing(struct kvm *kvm) +{ + return kvm_set_irq_routing(kvm, empty_routing, 0, 0); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c4bcc86d6dc4..e05946c36b87 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,7 +209,8 @@ out: if (old) kfree_rcu(old, rcu); - kvm_vcpu_request_scan_ioapic(kvm); + if (ioapic_in_kernel(kvm)) + kvm_vcpu_request_scan_ioapic(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) @@ -1845,7 +1846,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_rtc_eoi_tracking_restore_one(vcpu); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_rtc_eoi_tracking_restore_one(vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 28c98c8f9d1c..f720774a4797 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2448,6 +2448,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: + case KVM_CAP_SPLIT_IRQCHIP: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -3555,6 +3556,24 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.disabled_quirks = cap->args[0]; r = 0; break; + case KVM_CAP_SPLIT_IRQCHIP: { + mutex_lock(&kvm->lock); + r = -EEXIST; + if (irqchip_in_kernel(kvm)) + goto split_irqchip_unlock; + if (atomic_read(&kvm->online_vcpus)) + goto split_irqchip_unlock; + r = kvm_setup_empty_irq_routing(kvm); + if (r) + goto split_irqchip_unlock; + /* Pairs with irqchip_in_kernel. 
*/ + smp_wmb(); + kvm->arch.irqchip_split = true; + r = 0; +split_irqchip_unlock: + mutex_unlock(&kvm->lock); + break; + } default: r = -EINVAL; break; @@ -3668,7 +3687,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -3692,7 +3711,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm)) + if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..354f147647ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1002,6 +1002,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #endif int kvm_setup_default_irq_routing(struct kvm *kvm); +int kvm_setup_empty_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a9256f0331ae..ed00f8fc9ea2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -824,6 +824,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_MULTI_ADDRESS_SPACE 118 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120 +#define KVM_CAP_SPLIT_IRQCHIP 121 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-59-g8ed1b From 7543a635aa09eb138b2cbf60ac3ff19503ae6954 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:21:41 -0700 Subject: KVM: x86: Add KVM exit for IOAPIC EOIs Adds KVM_EXIT_IOAPIC_EOI which allows the kernel to EOI level-triggered IOAPIC interrupts. Uses a per VCPU exit bitmap to decide whether or not the IOAPIC needs to be informed (which is identical to the EOI_EXIT_BITMAP field used by modern x86 processors, but can also be used to elide kvm IOAPIC EOI exits on older processors). [Note: A prototype using ResampleFDs found that decoupling the EOI from the VCPU's thread made it possible for the VCPU to not see a recent EOI after reentering the guest. This does not match real hardware.] Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 12 ++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/lapic.c | 24 +++++++++++++++++------- arch/x86/kvm/x86.c | 11 +++++++++++ include/linux/kvm_host.h | 2 +- include/uapi/linux/kvm.h | 5 +++++ 6 files changed, 48 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 43e0816d0de1..0d14bf5db534 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3309,6 +3309,18 @@ Valid values for 'type' are: to ignore the request, or to gather VM memory core dump and/or reset/shutdown of the VM. + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; + +Indicates that the VCPU's in-kernel local APIC received an EOI for a +level-triggered IOAPIC interrupt. This exit only triggers when the +IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled); +the userspace IOAPIC should process the EOI and retrigger the interrupt if +it is still asserted. Vector is the LAPIC interrupt vector for which the +EOI was received. + /* Fix the size of the union. 
*/ char padding[256]; }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index befcf555bddc..af09fa1d1be7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -574,6 +574,8 @@ struct kvm_vcpu_arch { struct { bool pv_unhalted; } pv; + + int pending_ioapic_eoi; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e05946c36b87..ef70f6f3a37a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -877,15 +877,25 @@ static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector) static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) { - if (kvm_ioapic_handles_vector(apic, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; + int trigger_mode; + + /* Eoi the ioapic only if the ioapic doesn't own the vector. */ + if (!kvm_ioapic_handles_vector(apic, vector)) + return; - kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); + /* Request a KVM exit to inform the userspace IOAPIC. */ + if (irqchip_split(apic->vcpu->kvm)) { + apic->vcpu->arch.pending_ioapic_eoi = vector; + kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu); + return; } + + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + + kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode); } static int apic_set_eoi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f720774a4797..27429daa054e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6271,6 +6271,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_pmu_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { + BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); + if (test_bit(vcpu->arch.pending_ioapic_eoi, + (void *) vcpu->arch.eoi_exit_bitmap)) { + vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; + vcpu->run->eoi.vector = + vcpu->arch.pending_ioapic_eoi; + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 354f147647ab..3b33215ed447 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -140,6 +140,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_APIC_PAGE_RELOAD 25 #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 +#define KVM_REQ_IOAPIC_EOI_EXIT 28 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -1146,4 +1147,3 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) } #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif - diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ed00f8fc9ea2..12e3afbf0f47 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -183,6 +183,7 @@ struct kvm_s390_skeys { #define KVM_EXIT_EPR 23 #define KVM_EXIT_SYSTEM_EVENT 24 #define KVM_EXIT_S390_STSI 25 +#define KVM_EXIT_IOAPIC_EOI 26 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -333,6 +334,10 @@ struct kvm_run { __u8 sel1; __u16 sel2; } s390_stsi; + /* KVM_EXIT_IOAPIC_EOI */ + struct { + __u8 vector; + } eoi; /* Fix the size of the union. 
*/ char padding[256]; }; -- cgit v1.2.3-59-g8ed1b From b053b2aef25d00773fa6762dcd4b7f5c9c42d171 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Wed, 29 Jul 2015 23:32:35 -0700 Subject: KVM: x86: Add EOI exit bitmap inference In order to support a userspace IOAPIC interacting with an in kernel APIC, the EOI exit bitmaps need to be configurable. If the IOAPIC is in userspace (i.e. the irqchip has been split), the EOI exit bitmaps will be set whenever the GSI Routes are configured. In particular, the low MSI routes are reservable for userspace IOAPICs. For these MSI routes, the EOI Exit bit corresponding to the destination vector of the route will be set for the destination VCPU. The intention is for the userspace IOAPICs to use the reservable MSI routes to inject interrupts into the guest. This is a slight abuse of the notion of an MSI Route, given that MSIs classically bypass the IOAPIC. It might be worthwhile to add an additional route type to improve clarity. Compile tested for Intel x86. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 ++++++--- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/ioapic.h | 2 ++ arch/x86/kvm/irq_comm.c | 42 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c | 3 +-- arch/x86/kvm/x86.c | 9 ++++++++- include/linux/kvm_host.h | 16 +++++++++++++++ virt/kvm/irqchip.c | 12 ++--------- 8 files changed, 78 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 0d14bf5db534..89e71648d748 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3642,7 +3642,7 @@ KVM handlers should exit to userspace with rc = -EREMOTE. 7.5 KVM_CAP_SPLIT_IRQCHIP Architectures: x86 -Parameters: None +Parameters: args[0] - number of routes reserved for userspace IOAPICs Returns: 0 on success, -1 on error Create a local apic for each processor in the kernel. This can be used @@ -3650,8 +3650,11 @@ instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the IOAPIC and PIC (and also the PIT, even though this has to be enabled separately). -This supersedes KVM_CREATE_IRQCHIP, creating only local APICs, but no in kernel -IOAPIC or PIC. This also enables in kernel routing of interrupt requests. +This capability also enables in kernel routing of interrupt requests; +when KVM_CAP_SPLIT_IRQCHIP is enabled, only routes of KVM_IRQ_ROUTING_MSI type +are used in the IRQ routing table. The first args[0] MSI routes are reserved +for the IOAPIC pins. Whenever the LAPIC receives an EOI for these routes, +a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace. Fails if VCPU has already been created, or if the irqchip is already in the kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
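As a usage sketch of the documentation just above (illustrative userspace code, not part of the patch; vm_fd is assumed to be a VM file descriptor with no VCPUs created yet):

        #include <err.h>
        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        static void enable_split_irqchip(int vm_fd)
        {
                /* Reserve one MSI route per IOAPIC pin; the userspace IOAPIC
                 * injects through these GSIs and receives KVM_EXIT_IOAPIC_EOI
                 * exits for the level-triggered ones. */
                struct kvm_enable_cap cap = {
                        .cap = KVM_CAP_SPLIT_IRQCHIP,
                        .args = { 24 },
                };

                if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
                        err(1, "KVM_ENABLE_CAP(KVM_CAP_SPLIT_IRQCHIP)");
        }

Per the text above, this must run before any VCPU is created and fails if an in-kernel irqchip already exists.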
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af09fa1d1be7..7a5f9debbcd8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,7 @@ struct kvm_arch { u64 disabled_quirks; bool irqchip_split; + u8 nr_reserved_ioapic_pins; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index a8842c0dee73..084617d37c74 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -9,6 +9,7 @@ struct kvm; struct kvm_vcpu; #define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS +#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES #define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */ #define IOAPIC_EDGE_TRIG 0 #define IOAPIC_LEVEL_TRIG 1 @@ -121,5 +122,6 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); #endif diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 67f6b62a6814..177460998bb0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -335,3 +335,45 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm) { return kvm_set_irq_routing(kvm, empty_routing, 0, 0); } + +void kvm_arch_irq_routing_update(struct kvm *kvm) +{ + if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm)) + return; + kvm_make_scan_ioapic_request(kvm); +} + +void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_kernel_irq_routing_entry *entry; + struct kvm_irq_routing_table *table; + u32 i, nr_ioapic_pins; + int idx; + + /* kvm->irq_routing must be read after clearing + * KVM_SCAN_IOAPIC. */ + smp_mb(); + idx = srcu_read_lock(&kvm->irq_srcu); + table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + nr_ioapic_pins = min_t(u32, table->nr_rt_entries, + kvm->arch.nr_reserved_ioapic_pins); + for (i = 0; i < nr_ioapic_pins; ++i) { + hlist_for_each_entry(entry, &table->map[i], link) { + u32 dest_id, dest_mode; + + if (entry->type != KVM_IRQ_ROUTING_MSI) + continue; + dest_id = (entry->msi.address_lo >> 12) & 0xff; + dest_mode = (entry->msi.address_lo >> 2) & 0x1; + if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, + dest_mode)) { + u32 vector = entry->msi.data & 0xff; + + __set_bit(vector, + (unsigned long *) eoi_exit_bitmap); + } + } + } + srcu_read_unlock(&kvm->irq_srcu, idx); +} diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ef70f6f3a37a..2f4c0d0cbe0a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -209,8 +209,7 @@ out: if (old) kfree_rcu(old, rcu); - if (ioapic_in_kernel(kvm)) - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); } static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27429daa054e..4aeed2086c5e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3558,6 +3558,9 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; case KVM_CAP_SPLIT_IRQCHIP: { mutex_lock(&kvm->lock); + r = -EINVAL; + if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) + goto split_irqchip_unlock; r = -EEXIST; if (irqchip_in_kernel(kvm)) goto split_irqchip_unlock; @@ -3569,6 +3572,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); kvm->arch.irqchip_split = true; + kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: mutex_unlock(&kvm->lock); break; } @@ -6167,7 +6171,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + if (irqchip_split(vcpu->kvm)) + kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); + else + kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); kvm_x86_ops->load_eoi_exitmap(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3b33215ed447..d754f2d826e9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,18 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING +struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; + u32 nr_rt_entries; + /* + * Array indexed by gsi. Each entry contains list of irq chips + * the gsi is connected to. + */ + struct hlist_head map[0]; +}; +#endif + #ifndef KVM_PRIVATE_MEM_SLOTS #define KVM_PRIVATE_MEM_SLOTS 0 #endif @@ -456,10 +468,14 @@ void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_irq_routing_update(struct kvm *kvm); #else static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) { } +static inline void kvm_arch_irq_routing_update(struct kvm *kvm) +{ +} #endif #ifdef CONFIG_HAVE_KVM_IRQFD diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index d7ea8e20dae4..716a1c4db528 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -31,16 +31,6 @@ #include #include "irq.h" -struct kvm_irq_routing_table { - int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS]; - u32 nr_rt_entries; - /* - * Array indexed by gsi. Each entry contains list of irq chips - * the gsi is connected to. - */ - struct hlist_head map[0]; -}; - int kvm_irq_map_gsi(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *entries, int gsi) { @@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm, kvm_irq_routing_update(kvm); mutex_unlock(&kvm->irq_lock); + kvm_arch_irq_routing_update(kvm); + synchronize_srcu_expedited(&kvm->irq_srcu); new = old; -- cgit v1.2.3-59-g8ed1b From 1c1a9ce973a7863dd46767226bce2a5f12d48bc6 Mon Sep 17 00:00:00 2001 From: Steve Rutherford Date: Thu, 30 Jul 2015 11:27:16 +0200 Subject: KVM: x86: Add support for local interrupt requests from userspace In order to enable userspace PIC support, the userspace PIC needs to be able to inject local interrupts even when the APICs are in the kernel. KVM_INTERRUPT now supports sending local interrupts to an APIC when APICs are in the kernel. The ready_for_interrupt_injection flag is now only set when the CPU/APIC will immediately accept and inject an interrupt (i.e. APIC has not masked the PIC). When the PIC wishes to initiate an INTA cycle with, say, CPU0, it kicks CPU0 out of the guest, and rendezvouses with CPU0 once it arrives in userspace. When the CPU/APIC unmasks the PIC, a KVM_EXIT_IRQ_WINDOW_OPEN is triggered, so that userspace has a chance to inject a PIC interrupt if it had been pending. Overall, this design can lead to a small number of spurious userspace rendezvous. In particular, whenever the PIC transitions from low to high while it is masked and whenever the PIC becomes unmasked while it is low.
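A sketch of the resulting INTA-style handshake from the VMM side (hypothetical code, not from the patch; pic_output() and pic_read_irq() stand in for the VMM's own 8259 model):

        #include <linux/kvm.h>
        #include <sys/ioctl.h>

        extern int pic_output(void);             /* hypothetical 8259 model */
        extern unsigned char pic_read_irq(void); /* hypothetical 8259 model */

        static void pic_try_inject(int vcpu_fd, struct kvm_run *run)
        {
                if (!pic_output())              /* INTR line is low */
                        return;
                if (!run->ready_for_interrupt_injection) {
                        /* LAPIC has the PIC masked or an interrupt is already
                         * pending: ask for KVM_EXIT_IRQ_WINDOW_OPEN instead. */
                        run->request_interrupt_window = 1;
                        return;
                }
                struct kvm_interrupt irq = { .irq = pic_read_irq() }; /* INTA */
                ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
        }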
Note: this does not buffer more than one local interrupt in the kernel, so the VMM needs to enter the guest in order to complete interrupt injection before injecting an additional interrupt. Compiles for x86. Can pass the KVM Unit Tests. Signed-off-by: Steve Rutherford Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 14 +++++++++---- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/irq.c | 32 +++++++++++++++++++++++------ arch/x86/kvm/irq.h | 8 ++++++++ arch/x86/kvm/x86.c | 42 ++++++++++++++++++++++++++++++--------- 5 files changed, 78 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 89e71648d748..e3e9c41721a2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -401,10 +401,9 @@ Capability: basic Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) -Returns: 0 on success, -1 on error +Returns: 0 on success, negative on failure. -Queues a hardware interrupt vector to be injected. This is only -useful if in-kernel local APIC or equivalent is not used. +Queues a hardware interrupt vector to be injected. /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -414,7 +413,14 @@ struct kvm_interrupt { X86: -Note 'irq' is an interrupt vector, not an interrupt pin or line. +Returns: 0 on success, + -EEXIST if an interrupt is already enqueued + -EINVAL if the irq number is invalid + -ENXIO if the PIC is in the kernel + -EFAULT if the pointer is invalid + +Note 'irq' is an interrupt vector, not an interrupt pin or line. This +ioctl is useful if the in-kernel PIC is not used. PPC: diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7a5f9debbcd8..76a5b30979b3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -576,6 +576,7 @@ struct kvm_vcpu_arch { } pv; int pending_ioapic_eoi; + int pending_external_vector; }; struct kvm_lpage_info { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index b653ae202c8e..097060e33bd6 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); +/* + * check if there is a pending userspace external interrupt + */ +static int pending_userspace_extint(struct kvm_vcpu *v) +{ + return v->arch.pending_external_vector != -1; +} + /* * check if there is pending interrupt from * non-APIC source without intack.
*/ static int kvm_cpu_has_extint(struct kvm_vcpu *v) { - if (kvm_apic_accept_pic_intr(v)) - return pic_irqchip(v->kvm)->output; /* PIC */ - else + u8 accept = kvm_apic_accept_pic_intr(v); + + if (accept) { + if (irqchip_split(v->kvm)) + return pending_userspace_extint(v); + else + return pic_irqchip(v->kvm)->output; + } else return 0; } @@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); */ static int kvm_cpu_get_extint(struct kvm_vcpu *v) { - if (kvm_cpu_has_extint(v)) - return kvm_pic_read_irq(v->kvm); /* PIC */ - return -1; + if (kvm_cpu_has_extint(v)) { + if (irqchip_split(v->kvm)) { + int vector = v->arch.pending_external_vector; + + v->arch.pending_external_vector = -1; + return vector; + } else + return kvm_pic_read_irq(v->kvm); /* PIC */ + } else + return -1; } /* diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2f9703dcd913..ae5c78f2337d 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,14 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) return kvm->arch.vpic; } +static inline int pic_in_kernel(struct kvm *kvm) +{ + int ret; + + ret = (pic_irqchip(kvm) != NULL); + return ret; +} + static inline int irqchip_split(struct kvm *kvm) { return kvm->arch.irqchip_split; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4aeed2086c5e..8b97c54edebc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2662,12 +2662,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, { if (irq->irq >= KVM_NR_INTERRUPTS) return -EINVAL; - if (irqchip_in_kernel(vcpu->kvm)) + + if (!irqchip_in_kernel(vcpu->kvm)) { + kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; + } + + /* + * With in-kernel LAPIC, we only use this to inject EXTINT, so + * fail for in-kernel 8259. + */ + if (pic_in_kernel(vcpu->kvm)) return -ENXIO; - kvm_queue_interrupt(vcpu, irq->irq, false); - kvm_make_request(KVM_REQ_EVENT, vcpu); + if (vcpu->arch.pending_external_vector != -1) + return -EEXIST; + vcpu->arch.pending_external_vector = irq->irq; return 0; } @@ -5796,9 +5808,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) */ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { - return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - vcpu->run->request_interrupt_window && - kvm_arch_interrupt_allowed(vcpu)); + if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm)) + return false; + + if (kvm_cpu_has_interrupt(vcpu)) + return false; + + return (irqchip_split(vcpu->kvm) + ? kvm_apic_accept_pic_intr(vcpu) + : kvm_arch_interrupt_allowed(vcpu)); } static void post_kvm_run_save(struct kvm_vcpu *vcpu) @@ -5809,13 +5827,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) kvm_run->flags = is_smm(vcpu) ? 
KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_run->ready_for_interrupt_injection = 1; - else + if (!irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = kvm_arch_interrupt_allowed(vcpu) && !kvm_cpu_has_interrupt(vcpu) && !kvm_event_needs_reinjection(vcpu); + else if (!pic_in_kernel(vcpu->kvm)) + kvm_run->ready_for_interrupt_injection = + kvm_apic_accept_pic_intr(vcpu) && + !kvm_cpu_has_interrupt(vcpu); + else + kvm_run->ready_for_interrupt_injection = 1; } static void update_cr8_intercept(struct kvm_vcpu *vcpu) @@ -7403,6 +7425,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); + vcpu->arch.pending_external_vector = -1; + return 0; fail_free_mce_banks: -- cgit v1.2.3-59-g8ed1b From 931c33b178b091cced2a6b3f57f04655f8ff5207 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 15 Sep 2015 14:41:58 +0800 Subject: kvm: add tracepoint for fast mmio Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Jason Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 18 ++++++++++++++++++ arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 4eae7c35ddf5..ce4abe333c39 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio, __entry->count > 1 ? "(...)" : "") ); +/* + * Tracepoint for fast mmio. + */ +TRACE_EVENT(kvm_fast_mmio, + TP_PROTO(u64 gpa), + TP_ARGS(gpa), + + TP_STRUCT__entry( + __field(u64, gpa) + ), + + TP_fast_assign( + __entry->gpa = gpa; + ), + + TP_printk("fast mmio at gpa 0x%llx", __entry->gpa) +); + /* * Tracepoint for cpuid. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee0dc69675b..324b09ff1def 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5756,6 +5756,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { skip_emulated_instruction(vcpu); + trace_kvm_fast_mmio(gpa); return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b97c54edebc..b90ad01c617d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8070,6 +8070,7 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); -- cgit v1.2.3-59-g8ed1b From e516cebb4f2b2057a5a421fea589079502acfff6 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:48 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_RESET msr HV_X64_MSR_RESET msr is used by Hyper-V based Windows guest to reset guest VM by hypervisor. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. 
Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 10 ++++++++++ arch/x86/kvm/x86.c | 7 +++++++ include/linux/kvm_host.h | 1 + 4 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index f0412c50c47b..dab584bf7ddf 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -153,6 +153,9 @@ /* MSR used to provide vcpu index */ #define HV_X64_MSR_VP_INDEX 0x40000002 +/* MSR used to reset the guest OS. */ +#define HV_X64_MSR_RESET 0x40000003 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a8160d2ae362..0ad11a232474 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_TIME_REF_COUNT: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: + case HV_X64_MSR_RESET: r = true; break; } @@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, data); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_set_crash_ctl(vcpu, data, host); + case HV_X64_MSR_RESET: + if (data == 1) { + vcpu_debug(vcpu, "hyper-v reset requested\n"); + kvm_make_request(KVM_REQ_HV_RESET, vcpu); + } + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -241,6 +248,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) pdata); case HV_X64_MSR_CRASH_CTL: return kvm_hv_msr_get_crash_ctl(vcpu, pdata); + case HV_X64_MSR_RESET: + data = 0; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b90ad01c617d..297b36fa3b80 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -952,6 +952,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, + HV_X64_MSR_RESET, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, @@ -6321,6 +6322,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } + if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; + vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; + r = 0; + goto out; + } } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d754f2d826e9..cd0ba2e931e1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -141,6 +141,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_SMI 26 #define KVM_REQ_HV_CRASH 27 #define KVM_REQ_IOAPIC_EOI_EXIT 28 +#define KVM_REQ_HV_RESET 29 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 -- cgit v1.2.3-59-g8ed1b From 11c4b1ca719eaaa5ca6fe0e80bb009f3f2012fd2 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:49 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_INDEX export for QEMU. Insert Hyper-V HV_X64_MSR_VP_INDEX into msr's emulated list, so QEMU can set Hyper-V features cpuid HV_X64_MSR_VP_INDEX_AVAILABLE bit correctly. KVM emulation part is in place already. 
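(For context, a hedged userspace-side sketch, not from this series: a VMM can check whether KVM enumerates a given Hyper-V MSR, such as HV_X64_MSR_VP_INDEX, before advertising the matching CPUID bit. KVM_GET_MSR_INDEX_LIST and struct kvm_msr_list are real UAPI; the helper itself is hypothetical.)

#include <linux/kvm.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Hypothetical helper: does KVM enumerate this MSR in its index list? */
static bool kvm_has_msr(int kvm_fd, __u32 msr)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;
	bool found = false;
	__u32 i;

	/* The first call fails with E2BIG but fills in the required count. */
	ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
	list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
	if (!list)
		return false;
	list->nmsrs = probe.nmsrs;
	if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) == 0)
		for (i = 0; i < list->nmsrs; i++)
			if (list->indices[i] == msr)
				found = true;
	free(list);
	return found;
}

A VMM would call, say, kvm_has_msr(kvm_fd, HV_X64_MSR_VP_INDEX) and only then set HV_X64_MSR_VP_INDEX_AVAILABLE in the Hyper-V CPUID leaves it exposes.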
Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 297b36fa3b80..716b4610791c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -953,6 +953,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, + HV_X64_MSR_VP_INDEX, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, -- cgit v1.2.3-59-g8ed1b From 9eec50b8bbe1535c440a1ee88c1958f78fc55957 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Wed, 16 Sep 2015 12:29:50 +0300 Subject: kvm/x86: Hyper-V HV_X64_MSR_VP_RUNTIME support HV_X64_MSR_VP_RUNTIME msr used by guest to get "the time the virtual processor consumes running guest code, and the time the associated logical processor spends running hypervisor code on behalf of that guest." Calculation of this time is performed by task_cputime_adjusted() for vcpu task. Necessary to support loading of winhv.sys in guest, which in turn is required to support Windows VMBus. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Paolo Bonzini CC: Gleb Natapov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/hyperv.h | 3 +++ arch/x86/kvm/hyperv.c | 21 +++++++++++++++++++-- arch/x86/kvm/x86.c | 1 + kernel/sched/cputime.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76a5b30979b3..d064cb2e19e8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -374,6 +374,7 @@ struct kvm_mtrr { /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { u64 hv_vapic; + s64 runtime_offset; }; struct kvm_vcpu_arch { diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index dab584bf7ddf..2677a0aac2cc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -156,6 +156,9 @@ /* MSR used to reset the guest OS. 
*/ #define HV_X64_MSR_RESET 0x40000003 +/* MSR used to provide vcpu runtime in 100ns units */ +#define HV_X64_MSR_VP_RUNTIME 0x40000010 + /* MSR used to read the per-partition time reference counter */ #define HV_X64_MSR_TIME_REF_COUNT 0x40000020 diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0ad11a232474..62cf8c915e95 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -178,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, return 0; } -static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +/* Calculate cpu time spent by current task in 100ns units */ +static u64 current_task_runtime_100ns(void) +{ + cputime_t utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return div_u64(cputime_to_nsecs(utime + stime), 100); +} + +static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) { struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv; @@ -212,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); + case HV_X64_MSR_VP_RUNTIME: + if (!host) + return 1; + hv->runtime_offset = data - current_task_runtime_100ns(); + break; default: vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -287,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_APIC_ASSIST_PAGE: data = hv->hv_vapic; break; + case HV_X64_MSR_VP_RUNTIME: + data = current_task_runtime_100ns() + hv->runtime_offset; + break; default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -305,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) mutex_unlock(&vcpu->kvm->lock); return r; } else - return kvm_hv_set_msr(vcpu, msr, data); + return kvm_hv_set_msr(vcpu, msr, data, host); } int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 716b4610791c..185fc1619c99 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -954,6 +954,7 @@ static u32 emulated_msrs[] = { HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, HV_X64_MSR_RESET, HV_X64_MSR_VP_INDEX, + HV_X64_MSR_VP_RUNTIME, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8cbc3db671df..26a54461bf59 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) *ut = p->utime; *st = p->stime; } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) task_cputime(p, &cputime.utime, &cputime.stime); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } +EXPORT_SYMBOL_GPL(task_cputime_adjusted); void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { -- cgit v1.2.3-59-g8ed1b From d6a858d13e5d02b97daab5ba0648c704ae3f9517 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 28 Sep 2015 11:58:14 +0200 Subject: KVM: vmx: disable posted interrupts if no local APIC Uniprocessor 32-bit randconfigs can disable the local APIC, and posted interrupts require reserving a vector on the LAPIC, so they are 
incompatible. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 324b09ff1def..0f15e2382109 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -983,7 +983,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) static inline bool cpu_has_vmx_posted_intr(void) { - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; + return IS_ENABLED(CONFIG_X86_LOCAL_APIC) && + vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; } static inline bool cpu_has_vmx_apicv(void) -- cgit v1.2.3-59-g8ed1b From eb1c31b46800a476644cd63ab3bc7ef918b0a917 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:50 +0800 Subject: KVM: x86: allow guest to use clflushopt and clwb Pass these CPU features to the guest to enable them there. They are needed by nvdimm drivers. Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2fbea2544f24..962fc7d7e0d4 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = -- cgit v1.2.3-59-g8ed1b From 8b3e34e46aca9b6d349b331cd9cf71ccbdc91b2e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:51 +0800 Subject: KVM: x86: add pcommit support Pass the PCOMMIT CPU feature to the guest to enable the PCOMMIT instruction. Currently we do not catch the pcommit instruction for the L1 guest and allow L1 to catch this instruction for L2 if, as required by the spec, L1 can enumerate the PCOMMIT instruction via CPUID: | IA32_VMX_PROCBASED_CTLS2[53] (which enumerates support for the | 1-setting of PCOMMIT exiting) is always the same as | CPUID.07H:EBX.PCOMMIT[bit 22].
Thus, software can set PCOMMIT exiting | to 1 if and only if the PCOMMIT instruction is enumerated via CPUID The spec can be found at https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 2 +- arch/x86/include/uapi/asm/vmx.h | 4 +++- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 34 +++++++++++++++++++++++++++++----- 5 files changed, 42 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 448b7ca61aee..d25f32ac9c5c 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -72,7 +72,7 @@ #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 - +#define SECONDARY_EXEC_PCOMMIT 0x00200000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..5b15d94a33f8 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -78,6 +78,7 @@ #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 +#define EXIT_REASON_PCOMMIT 65 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -126,7 +127,8 @@ { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ - { EXIT_REASON_XRSTORS, "XRSTORS" } + { EXIT_REASON_XRSTORS, "XRSTORS" }, \ + { EXIT_REASON_PCOMMIT, "PCOMMIT" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 962fc7d7e0d4..faeb0b3bb343 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ const u32 kvm_supported_word10_x86_features = diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index dd05b9cef6ae..aed7bfeeaf0d 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -133,4 +133,12 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_MPX)); } + +static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0f15e2382109..691010604144 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2475,7 +2475,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | - SECONDARY_EXEC_XSAVES; + SECONDARY_EXEC_XSAVES | + SECONDARY_EXEC_PCOMMIT; if (enable_ept) { /* nested EPT: emulate EPT also to L1 */ @@ -3016,7 +3017,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_ENABLE_PML; + 
SECONDARY_EXEC_ENABLE_PML | + SECONDARY_EXEC_PCOMMIT; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -4571,6 +4573,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) /* PML is enabled/disabled in creating/destorying vcpu */ exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + /* Currently, we allow L1 guest to directly run pcommit instruction. */ + exec_control &= ~SECONDARY_EXEC_PCOMMIT; + return exec_control; } @@ -4614,10 +4619,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { + if (cpu_has_secondary_exec_ctrls()) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control(vmx)); - } if (vmx_cpu_uses_apicv(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -7212,6 +7216,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) return 1; } +static int handle_pcommit(struct kvm_vcpu *vcpu) +{ + /* we never catch pcommit instruct for L1 guest. */ + WARN_ON(1); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -7262,6 +7273,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSAVES] = handle_xsaves, [EXIT_REASON_XRSTORS] = handle_xrstors, [EXIT_REASON_PML_FULL] = handle_pml_full, + [EXIT_REASON_PCOMMIT] = handle_pcommit, }; static const int kvm_vmx_max_exit_handlers = @@ -7563,6 +7575,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) * the XSS exit bitmap in vmcs12. */ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES); + case EXIT_REASON_PCOMMIT: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT); default: return true; } @@ -8697,6 +8711,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { + if (guest_cpuid_has_pcommit(vcpu)) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_PCOMMIT; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_PCOMMIT; + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9310,7 +9333,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | - SECONDARY_EXEC_APIC_REGISTER_VIRT); + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_PCOMMIT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; -- cgit v1.2.3-59-g8ed1b From e2821620c0908593e6cb40f79c7e78b31921ed73 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:52 +0800 Subject: KVM: VMX: drop rdtscp_enabled check in prepare_vmcs02() SECONDARY_EXEC_RDTSCP set for L2 guest comes from vmcs12 Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 691010604144..8f4c13808232 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9327,8 +9327,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx_secondary_exec_control(vmx); - if (!vmx->rdtscp_enabled) - exec_control &= ~SECONDARY_EXEC_RDTSCP; + /* Take 
the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | -- cgit v1.2.3-59-g8ed1b From f36201e5f44b55faf44ddc820929548e51ec841b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:53 +0800 Subject: KVM: VMX: simplify rdtscp handling in vmx_cpuid_update() if vmx_rdtscp_supported() is true SECONDARY_EXEC_RDTSCP must have already been set in current vmcs by vmx_secondary_exec_control() Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8f4c13808232..d53f57aad5eb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8677,16 +8677,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - if (exec_control & SECONDARY_EXEC_RDTSCP) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) + vmx->rdtscp_enabled = true; + else { + exec_control &= ~SECONDARY_EXEC_RDTSCP; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); } + if (nested && !vmx->rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high &= ~SECONDARY_EXEC_RDTSCP; -- cgit v1.2.3-59-g8ed1b From 29541bb8f4a18b2939dea137315e1803b4617c8d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:54 +0800 Subject: KVM: VMX: simplify invpcid handling in vmx_cpuid_update() If vmx_invpcid_supported() is true, second execution control filed must be supported and SECONDARY_EXEC_ENABLE_INVPCID must have already been set in current vmcs by vmx_secondary_exec_control() If vmx_invpcid_supported() is false, no need to clear SECONDARY_EXEC_ENABLE_INVPCID Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d53f57aad5eb..a5b26a940df2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8694,19 +8694,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && - best && (best->ebx & bit(X86_FEATURE_INVPCID)) && - guest_cpuid_has_pcid(vcpu)) { + (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || + !guest_cpuid_has_pcid(vcpu))) { exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } else { - if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } -- cgit v1.2.3-59-g8ed1b From 8b97265a1516819d54c2d24b3874489a52f269d4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 15 Sep 2015 17:34:42 +0200 Subject: KVM: VMX: align 
vmx->nested.nested_vmx_secondary_ctls_high to vmx->rdtscp_enabled The SECONDARY_EXEC_RDTSCP must be available iff RDTSCP is enabled in the guest. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a5b26a940df2..dad471f8ca57 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8686,9 +8686,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) exec_control); } - if (nested && !vmx->rdtscp_enabled) - vmx->nested.nested_vmx_secondary_ctls_high &= - ~SECONDARY_EXEC_RDTSCP; + if (nested) { + if (vmx->rdtscp_enabled) + vmx->nested.nested_vmx_secondary_ctls_high |= + SECONDARY_EXEC_RDTSCP; + else + vmx->nested.nested_vmx_secondary_ctls_high &= + ~SECONDARY_EXEC_RDTSCP; + } } /* Exposing INVPCID only when PCID is exposed */ -- cgit v1.2.3-59-g8ed1b From feda805fe7c4ed9cf78158e73b1218752e3b4314 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:55 +0800 Subject: KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dad471f8ca57..a7a0ed6906fe 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8668,23 +8668,38 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } +static void vmcs_set_secondary_exec_control(u32 new_ctl) +{ + /* + * These bits in the secondary execution controls field + * are dynamic, the others are mostly based on the hypervisor + * architecture and the guest's CPUID. Do not touch the + * dynamic bits. 
+ */ + u32 mask = + SECONDARY_EXEC_SHADOW_VMCS | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + + u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + (new_ctl & ~mask) | (cur_ctl & mask)); +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control; + u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) vmx->rdtscp_enabled = true; - else { - exec_control &= ~SECONDARY_EXEC_RDTSCP; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); - } + else + secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { if (vmx->rdtscp_enabled) @@ -8701,14 +8716,14 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (vmx_invpcid_supported() && (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) || !guest_cpuid_has_pcid(vcpu))) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID; if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } + vmcs_set_secondary_exec_control(secondary_exec_ctl); + if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) { if (guest_cpuid_has_pcommit(vcpu)) vmx->nested.nested_vmx_secondary_ctls_high |= -- cgit v1.2.3-59-g8ed1b From 7ec362964d6cc228ea68572fdd6d75a59f8b40fa Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:56 +0800 Subject: KVM: VMX: clean up bit operation on SECONDARY_VM_EXEC_CONTROL Use vmcs_set_bits() and vmcs_clear_bits() to clean up the code Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a7a0ed6906fe..2937cf136df6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6636,7 +6636,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { - u32 exec_control; if (vmx->nested.current_vmptr == -1ull) return; @@ -6649,9 +6648,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) they were modified */ copy_shadow_to_vmcs12(vmx); vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, -1ull); } vmx->nested.posted_intr_nv = -1; @@ -7047,7 +7045,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); gpa_t vmptr; - u32 exec_control; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -7079,9 +7076,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; if (enable_shadow_vmcs) { - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_SHADOW_VMCS); vmcs_write64(VMCS_LINK_POINTER, 
__pa(vmx->nested.current_shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; @@ -7591,7 +7587,6 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) static int vmx_enable_pml(struct vcpu_vmx *vmx) { struct page *pml_pg; - u32 exec_control; pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pml_pg) @@ -7602,24 +7597,18 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control |= SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); return 0; } static void vmx_disable_pml(struct vcpu_vmx *vmx) { - u32 exec_control; - ASSERT(vmx->pml_pg); __free_page(vmx->pml_pg); vmx->pml_pg = NULL; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) -- cgit v1.2.3-59-g8ed1b From 1cea0ce68ed76490ffa64a9e2a7a40104efe9352 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Sep 2015 14:05:57 +0800 Subject: KVM: VMX: drop rdtscp_enabled field Check cpuid bit instead of it Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 17 ++++++----------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index aed7bfeeaf0d..d434ee952b00 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -141,4 +141,12 @@ static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_PCOMMIT)); } + +static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_RDTSCP)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2937cf136df6..6f3f97f6e248 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -532,8 +532,6 @@ struct vcpu_vmx { s64 vnmi_blocked_time; u32 exit_reason; - bool rdtscp_enabled; - /* Posted interrupt descriptor */ struct pi_desc pi_desc; @@ -2208,7 +2206,7 @@ static void setup_msrs(struct vcpu_vmx *vmx) if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); index = __find_msr_index(vmx, MSR_TSC_AUX); - if (index >= 0 && vmx->rdtscp_enabled) + if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu)) move_msr_up(vmx, index, save_nmsrs++); /* * MSR_STAR is only needed on long mode guests, and only @@ -2675,7 +2673,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: - if (!to_vmx(vcpu)->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Otherwise falls through */ default: @@ -2781,7 +2779,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) clear_atomic_switch_msr(vmx, MSR_IA32_XSS); break; case MSR_TSC_AUX: - if (!vmx->rdtscp_enabled) + if (!guest_cpuid_has_rdtscp(vcpu)) return 1; /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) @@ -8682,16 +8680,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = 
to_vmx(vcpu); u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx); - vmx->rdtscp_enabled = false; if (vmx_rdtscp_supported()) { - best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) - vmx->rdtscp_enabled = true; - else + bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu); + if (!rdtscp_enabled) secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP; if (nested) { - if (vmx->rdtscp_enabled) + if (rdtscp_enabled) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_RDTSCP; else -- cgit v1.2.3-59-g8ed1b From 72c930dcfc2b49404ee9e20f6c868402e9c71166 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 18 Sep 2015 17:54:29 +0200 Subject: x86: kvmclock: abolish PVCLOCK_COUNTS_FROM_ZERO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Newer KVM won't be exposing PVCLOCK_COUNTS_FROM_ZERO anymore. The purpose of that flag was to start counting system time from 0 when the KVM clock has been initialized. We can achieve the same by selecting one read as the initial point. A simple subtraction will work unless the KVM clock count overflows earlier (has smaller width) than the scheduler's cycle count. We should be safe till x86_128. Because PVCLOCK_COUNTS_FROM_ZERO was enabled only on new hypervisors, setting sched clock as stable based on PVCLOCK_TSC_STABLE_BIT might regress on older ones. I presume we don't need to change kvm_clock_read instead of introducing kvm_sched_clock_read. A problem could arise in case sched_clock is expected to return the same value as get_cycles, but we should have merged those clocks in that case. Signed-off-by: Radim Krčmář Acked-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvmclock.c | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2c7aafa70702..2bd81e302427 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,6 +32,7 @@ static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; +static cycle_t kvm_sched_clock_offset; static int parse_no_kvmclock(char *arg) { @@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs) return kvm_clock_read(); } +static cycle_t kvm_sched_clock_read(void) +{ + return kvm_clock_read() - kvm_sched_clock_offset; +} + +static inline void kvm_sched_clock_init(bool stable) +{ + if (!stable) { + pv_time_ops.sched_clock = kvm_clock_read; + return; + } + + kvm_sched_clock_offset = kvm_clock_read(); + pv_time_ops.sched_clock = kvm_sched_clock_read; + set_sched_clock_stable(); + + printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", + kvm_sched_clock_offset); + + BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); +} + /* * If we don't do that, there is the possibility that the guest * will calibrate under heavy load - thus, getting a lower lpj - @@ -248,7 +272,17 @@ void __init kvmclock_init(void) memblock_free(mem, size); return; } - pv_time_ops.sched_clock = kvm_clock_read; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); + + cpu = get_cpu(); + vcpu_time = &hv_clock[cpu].pvti; + flags = pvclock_read_flags(vcpu_time); + + kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); + put_cpu(); + x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; @@ -265,16 +299,6 @@ void __init kvmclock_init(void) kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; - - if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) - pvclock_set_flags(~0); - - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - if (flags & PVCLOCK_COUNTS_FROM_ZERO) - set_sched_clock_stable(); - put_cpu(); } int __init kvm_setup_vsyscall_timeinfo(void) -- cgit v1.2.3-59-g8ed1b From 18cd52c4d9bfbd081fb643021ba8d048475cd82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Sep 2015 18:04:17 +0200 Subject: irq_remapping: move structs outside #ifdef This is friendlier to clients of the code, who are going to prepare vcpu_data structs unconditionally, even if CONFIG_IRQ_REMAP is not defined. Reviewed-by: Thomas Gleixner Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/irq_remapping.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h index 046c7fb1ca43..a210eba2727c 100644 --- a/arch/x86/include/asm/irq_remapping.h +++ b/arch/x86/include/asm/irq_remapping.h @@ -33,6 +33,11 @@ enum irq_remap_cap { IRQ_POSTING_CAP = 0, }; +struct vcpu_data { + u64 pi_desc_addr; /* Physical address of PI Descriptor */ + u32 vector; /* Guest vector of the interrupt */ +}; + #ifdef CONFIG_IRQ_REMAP extern bool irq_remapping_cap(enum irq_remap_cap cap); @@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void) return x86_vector_domain; } -struct vcpu_data { - u64 pi_desc_addr; /* Physical address of PI Descriptor */ - u32 vector; /* Guest vector of the interrupt */ -}; - #else /* CONFIG_IRQ_REMAP */ static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; } -- cgit v1.2.3-59-g8ed1b From 6ef1522f7ecc063317dfb5ca63da6e47130a4c50 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:45 +0800 Subject: KVM: Extend struct pi_desc for VT-d Posted-Interrupts Extend struct pi_desc for VT-d Posted-Interrupts. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6f3f97f6e248..4ebdccd882c8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -446,8 +446,24 @@ struct nested_vmx { /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ - u32 control; /* bit 0 of control is outstanding notification bit */ - u32 rsvd[7]; + union { + struct { + /* bit 256 - Outstanding Notification */ + u16 on : 1, + /* bit 257 - Suppress Notification */ + sn : 1, + /* bit 271:258 - Reserved */ + rsvd_1 : 14; + /* bit 279:272 - Notification Vector */ + u8 nv; + /* bit 287:280 - Reserved */ + u8 rsvd_2; + /* bit 319:288 - Notification Destination */ + u32 ndst; + }; + u64 control; + }; + u32 rsvd[6]; } __aligned(64); static bool pi_test_and_set_on(struct pi_desc *pi_desc) -- cgit v1.2.3-59-g8ed1b From ebbfc765369690cf8fc512615e6b83ec1702f8ac Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:46 +0800 Subject: KVM: Add some helper functions for Posted-Interrupts This patch adds some helper functions to manipulate the Posted-Interrupts Descriptor. 
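(To see where these helpers are headed, an editorial sketch follows. It loosely mirrors the IRTE-update patch later in this series; struct pi_desc and the pi_*_sn() helpers come from the diff below, while struct vcpu_data, irq_set_vcpu_affinity(), and the wrapper itself are assumed from later patches.)

/* Sketch only: bracket an IRTE rewrite with the SN bit so no
 * notification events are posted while the descriptor is repointed. */
static void pi_repoint_irte(struct pi_desc *pi_desc, unsigned int host_irq,
			    struct vcpu_data *vcpu_info)
{
	pi_set_sn(pi_desc);		/* suppress notification events */
	irq_set_vcpu_affinity(host_irq, vcpu_info);	/* rewrite the IRTE */
	pi_clear_sn(pi_desc);		/* posting allowed again */
}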
Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson [Make the new functions inline. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4ebdccd882c8..4739a52874c2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -443,6 +443,8 @@ struct nested_vmx { }; #define POSTED_INTR_ON 0 +#define POSTED_INTR_SN 1 + /* Posted-Interrupt Descriptor */ struct pi_desc { u32 pir[8]; /* Posted interrupt requested */ @@ -483,6 +485,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + return clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline void pi_set_sn(struct pi_desc *pi_desc) +{ + return set_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_on(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + +static inline int pi_test_sn(struct pi_desc *pi_desc) +{ + return test_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; -- cgit v1.2.3-59-g8ed1b From 8feb4a04dc756002f78df0026e58118669de4851 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:47 +0800 Subject: KVM: Define a new interface kvm_intr_is_single_vcpu() This patch defines a new interface kvm_intr_is_single_vcpu(), which returns whether the interrupt is for a single CPU or not. It is used by VT-d PI, since for now we only support single-CPU interrupts. For lowest-priority interrupts, if the user configures them via /proc/irq or uses irqbalance to make them single-CPU, we can use PI to deliver the interrupts. Full functionality of lowest-priority support will be added later.
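(A caller is expected to use the new interface roughly as follows; this is an illustrative shape only, and post_to_vcpu() and keep_remapped() are hypothetical stand-ins for the VT-d PI and remapped paths.)

/* Sketch: decide whether an MSI can be posted directly to a vCPU. */
static void route_msi(struct kvm *kvm, struct kvm_lapic_irq *irq)
{
	struct kvm_vcpu *dest = NULL;

	if (kvm_intr_is_single_vcpu(kvm, irq, &dest))
		post_to_vcpu(dest, irq->vector);	/* posted delivery */
	else
		keep_remapped(irq);			/* remapping fallback */
}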
Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/irq_comm.c | 27 +++++++++++++++++++ arch/x86/kvm/lapic.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/lapic.h | 2 ++ 4 files changed, 91 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d064cb2e19e8..ba4e5673e604 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1241,4 +1241,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 177460998bb0..39f833f8132e 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -297,6 +297,33 @@ out: return r; } +bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + int i, r = 0; + struct kvm_vcpu *vcpu; + + if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu)) + return true; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + + if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand, + irq->dest_id, irq->dest_mode)) + continue; + + if (++r == 2) + return false; + + *dest_vcpu = vcpu; + } + + return r == 1; +} +EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu); + #define IOAPIC_ROUTING_ENTRY(irq) \ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2f4c0d0cbe0a..944b38a56929 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -755,6 +755,65 @@ out: return ret; } +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu) +{ + struct kvm_apic_map *map; + bool ret = false; + struct kvm_lapic *dst = NULL; + + if (irq->shorthand) + return false; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (!map) + goto out; + + if (irq->dest_mode == APIC_DEST_PHYSICAL) { + if (irq->dest_id == 0xFF) + goto out; + + if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) + goto out; + + dst = map->phys_map[irq->dest_id]; + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } else { + u16 cid; + unsigned long bitmap = 1; + int i, r = 0; + + if (!kvm_apic_logical_map_valid(map)) + goto out; + + apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap); + + if (cid >= ARRAY_SIZE(map->logical_map)) + goto out; + + for_each_set_bit(i, &bitmap, 16) { + dst = map->logical_map[cid][i]; + if (++r == 2) + goto out; + } + + if (dst && kvm_apic_present(dst->vcpu)) + *dest_vcpu = dst->vcpu; + else + goto out; + } + + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /* * Add a pending IRQ into lapic. * Return 1 if successfully added and 0 if discarded. 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 7259d272416f..fde8e35d5850 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -168,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void wait_lapic_expire(struct kvm_vcpu *vcpu); +bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, + struct kvm_vcpu **dest_vcpu); #endif -- cgit v1.2.3-59-g8ed1b From d84f1e0755ba1e87d20d1f90c1e6eb0cbbc0af9d Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:49 +0800 Subject: KVM: make kvm_set_msi_irq() public Make kvm_set_msi_irq() public, we can use this function outside. Signed-off-by: Feng Wu Reviewed-by: Paolo Bonzini Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/irq_comm.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ba4e5673e604..79fffbbe2348 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -176,6 +176,8 @@ enum { */ #define KVM_APIC_PV_EOI_PENDING 1 +struct kvm_kernel_irq_routing_entry; + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -1244,4 +1246,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 39f833f8132e..c89228980230 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, return r; } -static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, - struct kvm_lapic_irq *irq) +void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm_lapic_irq *irq) { trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); @@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->level = 1; irq->shorthand = 0; } +EXPORT_SYMBOL_GPL(kvm_set_msi_irq); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) -- cgit v1.2.3-59-g8ed1b From efc644048ecde54f016011fe10110addd0de348f Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:51 +0800 Subject: KVM: x86: Update IRTE for posted-interrupts This patch adds the routine to update IRTE for posted-interrupts when guest changes the interrupt configuration. 
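(Before the full patch, a condensed editorial sketch of the hand-off it implements: KVM describes the target in a struct vcpu_data and asks the IOMMU layer to rewrite the IRTE; passing NULL reverts to remapped mode. The names are from this series; the wrapper itself is hypothetical.)

static int pi_point_irte_at(struct kvm_vcpu *vcpu, unsigned int host_irq,
			    u32 guest_vector, bool set)
{
	struct vcpu_data vcpu_info = {
		.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)),	/* PI descriptor PA */
		.vector = guest_vector,				/* guest vector */
	};

	return irq_set_vcpu_affinity(host_irq, set ? &vcpu_info : NULL);
}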
Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Fengguang Wu [Squashed in automatically generated patch from the build robot "KVM: x86: vcpu_to_pi_desc() can be static" - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++ arch/x86/kvm/trace.h | 33 ++++++++++++++++ arch/x86/kvm/vmx.c | 83 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 + 4 files changed, 121 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79fffbbe2348..a4067ecd4376 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -897,6 +897,9 @@ struct kvm_x86_ops { gfn_t offset, unsigned long mask); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ce4abe333c39..120302511802 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -992,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm, __entry->smbase) ); +/* + * Tracepoint for VT-d posted-interrupts. + */ +TRACE_EVENT(kvm_pi_irte_update, + TP_PROTO(unsigned int vcpu_id, unsigned int gsi, + unsigned int gvec, u64 pi_desc_addr, bool set), + TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, gsi ) + __field( unsigned int, gvec ) + __field( u64, pi_desc_addr ) + __field( bool, set ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->gsi = gsi; + __entry->gvec = gvec; + __entry->pi_desc_addr = pi_desc_addr; + __entry->set = set; + ), + + TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, " + "gvec: 0x%x, pi_desc_addr: 0x%llx", + __entry->set ? 
"enabled and being updated" : "disabled", + __entry->vcpu_id, + __entry->gsi, + __entry->gvec, + __entry->pi_desc_addr) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4739a52874c2..cb26b93a2cd7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -603,6 +604,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } +static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) +{ + return &(to_vmx(vcpu)->pi_desc); +} + #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [number] = VMCS12_OFFSET(name) #define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ @@ -10345,6 +10351,81 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * vmx_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu; + struct vcpu_data vcpu_info; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + /* + * VT-d PI cannot support posting multicast/broadcast + * interrupts to a vCPU, we still use interrupt remapping + * for these kind of interrupts. + * + * For lowest-priority interrupts, we only support + * those with single CPU as the destination, e.g. user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + * + * We will support full lowest-priority interrupt later. + */ + + kvm_set_msi_irq(e, &irq); + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) + continue; + + vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); + vcpu_info.vector = irq.vector; + + trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi, + vcpu_info.vector, vcpu_info.pi_desc_addr, set); + + if (set) + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); + else { + /* suppress notification event before unposting */ + pi_set_sn(vcpu_to_pi_desc(vcpu)); + ret = irq_set_vcpu_affinity(host_irq, NULL); + pi_clear_sn(vcpu_to_pi_desc(vcpu)); + } + + if (ret < 0) { + printk(KERN_INFO "%s: failed to update PI IRTE\n", + __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -10462,6 +10543,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 185fc1619c99..3978ace4ddbc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -64,6 +64,7 @@ #include /* Ugh! 
*/ #include #include +#include #define MAX_IO_MSRS 256 #define KVM_MAX_MCE_BANKS 32 @@ -8094,3 +8095,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); -- cgit v1.2.3-59-g8ed1b From 87276880065246ce49ec571130d3d1e4a22e5604 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:40 +0800 Subject: KVM: x86: select IRQ_BYPASS_MANAGER Select IRQ_BYPASS_MANAGER for x86 when CONFIG_KVM is set. Signed-off-by: Feng Wu Reviewed-by: Alex Williamson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/Kconfig | 2 ++ arch/x86/kvm/x86.c | 53 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a4067ecd4376..15664994b6f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index d8a1d56276e1..639a6e34500c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -28,6 +28,8 @@ config KVM select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3978ace4ddbc..b8425a769c0a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -8079,6 +8081,57 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (kvm_x86_ops->update_pi_irte) { + irqfd->producer = prod; + return kvm_x86_ops->update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); + } + + return -EINVAL; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + int ret; + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + if (!kvm_x86_ops->update_pi_irte) { + WARN_ON(irqfd->producer != NULL); + return; + } + + WARN_ON(irqfd->producer != prod); + irqfd->producer = NULL; + + /* + * When the producer of a consumer is unregistered, we change back to + * remapped mode, so we can re-use the current implementation + * when the irq is masked/disabled or the consumer side (KVM + * in this case) doesn't want to receive the interrupts.
+ */ + ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + if (ret) + printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); +} + +int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + if (!kvm_x86_ops->update_pi_irte) + return -EINVAL; + + return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); +} + EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); -- cgit v1.2.3-59-g8ed1b From 28b835d60fcc2498e717cf5e6f0c3691c24546f7 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:54 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is preempted This patch updates the Posted-Interrupts Descriptor when vCPU is preempted. sched out: - Set 'SN' to suppress future non-urgent interrupts posted for the vCPU. sched in: - Clear 'SN' - Change NDST if vCPU is scheduled to a different CPU - Set 'NV' to POSTED_INTR_VECTOR Signed-off-by: Feng Wu [Include asm/cpu.h to fix !CONFIG_SMP compilation. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cb26b93a2cd7..99f5c61954ea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "x86.h" +#include #include #include #include @@ -1942,6 +1943,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) preempt_enable(); } +static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + /* + * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there + * are two possible cases: + * 1. After running 'pre_block', context switch + * happened. For this case, 'sn' was set in + * vmx_vcpu_put(), so we need to clear it here. + * 2. After running 'pre_block', we were blocked, + * and woken up by some other guy. For this case, + * we don't need to do anything, 'pi_post_block' + * will do everything for us. However, we cannot + * check whether it is case #1 or case #2 here + * (maybe, not needed), so we also clear sn here, + * I think it is not a big deal. + */ + if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) { + if (vcpu->cpu != cpu) { + dest = cpu_physical_id(cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + } + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); +} /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken.
@@ -1992,10 +2039,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ vmx->loaded_vmcs->cpu = cpu; } + + vmx_vcpu_pi_load(vcpu, cpu); +} + +static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + /* Set SN when the vCPU is preempted */ + if (vcpu->preempted) + pi_set_sn(pi_desc); } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { + vmx_vcpu_pi_put(vcpu); + __vmx_load_host_state(to_vmx(vcpu)); if (!vmm_exclusive) { __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); @@ -4427,6 +4491,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Currently, we don't support urgent interrupts; + * all interrupts are recognized as non-urgent, + * so we cannot post interrupts when + * 'SN' is set. + * + * If the vcpu is in guest mode, it means it is + * running instead of being scheduled out and + * waiting in the run queue, and that's the only + * case when 'SN' is set currently, so warn + * if 'SN' is set. + */ + WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); return true; -- cgit v1.2.3-59-g8ed1b From bf9f6ac8d74969690df1485b33b7c238ca9f2269 Mon Sep 17 00:00:00 2001 From: Feng Wu Date: Fri, 18 Sep 2015 22:29:55 +0800 Subject: KVM: Update Posted-Interrupts Descriptor when vCPU is blocked This patch updates the Posted-Interrupts Descriptor when vCPU is blocked. pre-block: - Add the vCPU to the blocked per-CPU list - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR post-block: - Remove the vCPU from the per-CPU list Signed-off-by: Feng Wu [Concentrate invocation of pre/post-block hooks to vcpu_block. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/locking.txt | 12 +++ arch/x86/include/asm/kvm_host.h | 11 +++ arch/x86/kvm/vmx.c | 153 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 34 +++++--- include/linux/kvm_host.h | 3 + virt/kvm/kvm_main.c | 3 + 6 files changed, 206 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt index d68af4dc3006..19f94a6b9bb0 100644 --- a/Documentation/virtual/kvm/locking.txt +++ b/Documentation/virtual/kvm/locking.txt @@ -166,3 +166,15 @@ Comment: The srcu read lock must be held while accessing memslots (e.g. MMIO/PIO address->device structure mapping (kvm->buses). The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu if it is needed by multiple functions. + +Name: blocked_vcpu_on_cpu_lock +Type: spinlock_t +Arch: x86 +Protects: blocked_vcpu_on_cpu +Comment: This is a per-CPU lock and it is used for VT-d posted-interrupts. + When VT-d posted-interrupts is supported and the VM has assigned + devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu, + protected by blocked_vcpu_on_cpu_lock. When VT-d hardware issues a + wakeup notification event because external interrupts from the + assigned devices happen, we find the vCPU on the list and + wake it up.
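All of these hooks poke at the same 64-byte VT-d structure. For orientation, here is a sketch of the Posted-Interrupts Descriptor layout as the VT-d specification defines it; the authoritative declaration is struct pi_desc in arch/x86/kvm/vmx.c, and the bit widths below come from the spec rather than from the patch itself:

struct pi_desc {
	u32 pir[8];		/* Posted Interrupt Requests, one bit per vector */
	union {
		struct {
			u16	on : 1,		/* Outstanding Notification */
				sn : 1,		/* Suppress Notification */
				rsvd_1 : 14;
			u8	nv;		/* Notification Vector */
			u8	rsvd_2;
			u32	ndst;		/* Notification Destination (APIC ID) */
		};
		u64 control;	/* ON/SN/NV/NDST share one word, hence the
				 * cmpxchg loops over pi_desc->control above */
	};
	u32 rsvd[6];
} __aligned(64);

The cmpxchg-until-stable pattern in vmx_vcpu_pi_load() and vmx_pre_block() exists because hardware may set 'ON' concurrently while software rewrites NV, NDST, or SN.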
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 15664994b6f3..cdbdb559ecd2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -899,6 +899,17 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + /* + * Architecture specific hooks for vCPU blocking due to + * HLT instruction. + * Returns for .pre_block(): + * - 0 means continue to block the vCPU. + * - 1 means we cannot block the vCPU because some event + * happened during this period, e.g. the 'ON' bit in the + * posted-interrupts descriptor was set. + */ + int (*pre_block)(struct kvm_vcpu *vcpu); + void (*post_block)(struct kvm_vcpu *vcpu); int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); }; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 99f5c61954ea..c5c22831aee2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -878,6 +878,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); static DEFINE_PER_CPU(struct desc_ptr, host_gdt); +/* + * We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we + * can find which vCPU should be woken up. + */ +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); + static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; @@ -2986,6 +2993,8 @@ static int hardware_enable(void) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); + spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); /* * Now we can enable the vmclear operation in kdump @@ -6045,6 +6054,25 @@ static void update_ple_window_actual_max(void) ple_window_grow, INT_MIN); } +/* + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. + */ +static void wakeup_handler(void) +{ + struct kvm_vcpu *vcpu; + int cpu = smp_processor_id(); + + spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); + list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu), + blocked_vcpu_list) { + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (pi_test_on(pi_desc) == 1) + kvm_vcpu_kick(vcpu); + } + spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); +} + static __init int hardware_setup(void) { int r = -ENOMEM, i, msr; @@ -6231,6 +6259,8 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + kvm_set_posted_intr_wakeup_handler(wakeup_handler); + return alloc_kvm_area(); out8: @@ -10431,6 +10461,126 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); } +/* + * This routine does the following things for a vCPU which is going + * to be blocked if VT-d PI is enabled. + * - Store the vCPU to the wakeup list, so when interrupts happen + * we can find the right vCPU to wake up. + * - Change the Posted-interrupt descriptor as below: + * 'NDST' <-- vcpu->pre_pcpu + * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR + * - If 'ON' is set during this process, which means at least one + * interrupt is posted for this vCPU, we cannot block it; in + * this case, return 1; otherwise, return 0.
+ * + */ +static int vmx_pre_block(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + unsigned int dest; + struct pi_desc old, new; + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + vcpu->pre_pcpu = vcpu->cpu; + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_add_tail(&vcpu->blocked_vcpu_list, + &per_cpu(blocked_vcpu_on_cpu, + vcpu->pre_pcpu)); + spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + + do { + old.control = new.control = pi_desc->control; + + /* + * We should not block the vCPU if + * an interrupt is posted for it. + */ + if (pi_test_on(pi_desc) == 1) { + spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + + return 1; + } + + WARN((pi_desc->sn == 1), + "Warning: SN field of posted-interrupts " + "is set before blocking\n"); + + /* + * Since vCPU can be preempted during this process, + * vcpu->cpu could be different with pre_pcpu, we + * need to set pre_pcpu as the destination of wakeup + * notification event, then we can find the right vCPU + * to wakeup in wakeup handler if interrupts happen + * when the vCPU is in blocked state. + */ + dest = cpu_physical_id(vcpu->pre_pcpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* set 'NV' to 'wakeup vector' */ + new.nv = POSTED_INTR_WAKEUP_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + return 0; +} + +static void vmx_post_block(struct kvm_vcpu *vcpu) +{ + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + struct pi_desc old, new; + unsigned int dest; + unsigned long flags; + + if (!kvm_arch_has_assigned_device(vcpu->kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return; + + do { + old.control = new.control = pi_desc->control; + + dest = cpu_physical_id(vcpu->cpu); + + if (x2apic_enabled()) + new.ndst = dest; + else + new.ndst = (dest << 8) & 0xFF00; + + /* Allow posting non-urgent interrupts */ + new.sn = 0; + + /* set 'NV' to 'notification vector' */ + new.nv = POSTED_INTR_VECTOR; + } while (cmpxchg(&pi_desc->control, old.control, + new.control) != old.control); + + if(vcpu->pre_pcpu != -1) { + spin_lock_irqsave( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + list_del(&vcpu->blocked_vcpu_list); + spin_unlock_irqrestore( + &per_cpu(blocked_vcpu_on_cpu_lock, + vcpu->pre_pcpu), flags); + vcpu->pre_pcpu = -1; + } +} + /* * vmx_update_pi_irte - set IRTE for Posted-Interrupts * @@ -10622,6 +10772,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + .pmu_ops = &intel_pmu_ops, .update_pi_irte = vmx_update_pi_irte, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b8425a769c0a..2d2c9bb0d6d6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6335,6 +6335,20 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } } + /* + * KVM_REQ_EVENT is not set when posted interrupts are set by + * VT-d hardware, so we have to update RVI unconditionally. + */ + if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. 
+ */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); + } + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { @@ -6351,13 +6365,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (kvm_x86_ops->hwapic_irr_update) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -6493,10 +6500,15 @@ out: static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { - if (!kvm_arch_vcpu_runnable(vcpu)) { + if (!kvm_arch_vcpu_runnable(vcpu) && + (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); + + if (kvm_x86_ops->post_block) + kvm_x86_ops->post_block(vcpu); + if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; } @@ -6528,10 +6540,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu) for (;;) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && - !vcpu->arch.apf.halted) + !vcpu->arch.apf.halted) { r = vcpu_enter_guest(vcpu); - else + } else { r = vcpu_block(kvm, vcpu); + } + if (r <= 0) break; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5c3f4538807f..9596a2f0977b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -234,6 +234,9 @@ struct kvm_vcpu { unsigned long requests; unsigned long guest_debug; + int pre_pcpu; + struct list_head blocked_vcpu_list; + struct mutex mutex; struct kvm_run *run; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index afd7ae6aec65..a75502c93c3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); + vcpu->pre_pcpu = -1; + INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); + page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { r = -ENOMEM; -- cgit v1.2.3-59-g8ed1b From c77f3fab441c3e466b4c3601a475fc31ce156b06 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:33 +0200 Subject: kvm: x86: set KVM_REQ_EVENT when updating IRR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After moving PIR to IRR, the interrupt needs to be delivered manually. Reported-by: Paolo Bonzini Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 944b38a56929..168b8759bd73 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) struct kvm_lapic *apic = vcpu->arch.apic; __kvm_apic_update_irr(pir, apic->regs); + + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); -- cgit v1.2.3-59-g8ed1b From db2bdcbbbd32e5500b822d5e74ef8b5bd777e687 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:23:34 +0200 Subject: KVM: x86: fix edge EOI and IOAPIC reconfig race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM uses eoi_exit_bitmap to track vectors that need an action on EOI. 
The problem is that IOAPIC can be reconfigured while an interrupt with old configuration is pending and eoi_exit_bitmap only remembers the newest configuration; thus EOI from the pending interrupt is not recognized. (Reconfiguration is not a problem for level interrupts, because IOAPIC sends interrupt with the new configuration.) For an edge interrupt with ACK notifiers, like the i8254 timer, things can happen in this order: 1) IOAPIC injects a vector from i8254 2) guest reconfigures that vector's VCPU and therefore eoi_exit_bitmap on original VCPU gets cleared 3) guest's handler for the vector does EOI 4) KVM's EOI handler doesn't pass that vector to IOAPIC because it is not in that VCPU's eoi_exit_bitmap 5) i8254 stops working A simple solution is to set the IOAPIC vector in eoi_exit_bitmap if the vector is in PIR/IRR/ISR. This creates an unwanted situation if the vector is reused by a non-IOAPIC source, but I think it is so rare that we don't want to make the solution more sophisticated. The simple solution also doesn't work if we are reconfiguring the vector. (Shouldn't happen in the wild and I'd rather fix users of ACK notifiers instead of working around that.) There are no races because ioapic injection and reconfig are locked. Fixes: b053b2aef25d ("KVM: x86: Add EOI exit bitmap inference") [Before b053b2aef25d, this bug happened only with APICv.] Fixes: c7c9c56ca26f ("x86, apicv: add virtual interrupt delivery support") Cc: Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 4 +++- arch/x86/kvm/x86.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 2dcda0f188ba..88d0a92d3f94 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -246,7 +246,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) || index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, - e->fields.dest_id, e->fields.dest_mode) + e->fields.dest_id, e->fields.dest_mode) || + (e->fields.trig_mode == IOAPIC_EDGE_TRIG && + kvm_apic_pending_eoi(vcpu, e->fields.vector))) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e28954d2698a..e33aebbf189e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6201,8 +6201,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap); - else + else { + kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap); + } kvm_x86_ops->load_eoi_exitmap(vcpu); } -- cgit v1.2.3-59-g8ed1b From 13db77347db175a68edd58a79963cdf5cb3a9607 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 8 Oct 2015 20:30:00 +0200 Subject: KVM: x86: don't notify userspace IOAPIC on edge EOI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On real hardware, edge-triggered interrupts don't set a bit in TMR, which means that IOAPIC isn't notified on EOI. Do the same here. Staying in guest/kernel mode after edge EOI is what we want for most devices. If some bugs could be nicely worked around with edge EOI notifications, we should invest in a better interface.
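For reference on the race fix above: the kvm_apic_pending_eoi() helper it relies on (after sync_pir_to_irr() has folded PIR into IRR) amounts to an IRR/ISR test. Roughly, as a sketch of the lapic.c helper rather than a verbatim copy:

int kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
{
	struct kvm_lapic *apic = vcpu->arch.apic;

	/* an EOI is still owed if the vector is latched in IRR or ISR */
	return apic_test_vector(vector, apic->regs + APIC_IRR) ||
	       apic_test_vector(vector, apic->regs + APIC_ISR);
}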
Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index c89228980230..8f4499c7ffc1 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -389,13 +389,15 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) for (i = 0; i < nr_ioapic_pins; ++i) { hlist_for_each_entry(entry, &table->map[i], link) { u32 dest_id, dest_mode; + bool level; if (entry->type != KVM_IRQ_ROUTING_MSI) continue; dest_id = (entry->msi.address_lo >> 12) & 0xff; dest_mode = (entry->msi.address_lo >> 2) & 0x1; - if (kvm_apic_match_dest(vcpu, NULL, 0, dest_id, - dest_mode)) { + level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL; + if (level && kvm_apic_match_dest(vcpu, NULL, 0, + dest_id, dest_mode)) { u32 vector = entry->msi.data & 0xff; __set_bit(vector, -- cgit v1.2.3-59-g8ed1b From 991e7a0eedf12721f5561d7a1a54d248dc290bc0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 16 Sep 2015 17:30:05 +0800 Subject: KVM: VMX: adjust interface to allocate/free_vpid Adjust allocate/free_vpid so that they can be reused for the nested vpid. Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eeba6ac5914..1452271e98ae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4268,29 +4268,28 @@ static int alloc_identity_pagetable(struct kvm *kvm) return r; } -static void allocate_vpid(struct vcpu_vmx *vmx) +static int allocate_vpid(void) { int vpid; - vmx->vpid = 0; if (!enable_vpid) - return; + return 0; spin_lock(&vmx_vpid_lock); vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); - if (vpid < VMX_NR_VPIDS) { - vmx->vpid = vpid; + if (vpid < VMX_NR_VPIDS) __set_bit(vpid, vmx_vpid_bitmap); - } + else + vpid = 0; spin_unlock(&vmx_vpid_lock); + return vpid; } -static void free_vpid(struct vcpu_vmx *vmx) +static void free_vpid(int vpid) { - if (!enable_vpid) + if (!enable_vpid || vpid == 0) return; spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); + __clear_bit(vpid, vmx_vpid_bitmap); spin_unlock(&vmx_vpid_lock); } @@ -8629,7 +8628,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) if (enable_pml) vmx_disable_pml(vmx); - free_vpid(vmx); + free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); free_nested(vmx); @@ -8648,7 +8647,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); - allocate_vpid(vmx); + vmx->vpid = allocate_vpid(); err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -8724,7 +8723,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: - free_vpid(vmx); + free_vpid(vmx->vpid); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } -- cgit v1.2.3-59-g8ed1b From dd5f5341a3a684c8850f8a8ccbaa70e196518e76 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 23 Sep 2015 18:26:57 +0800 Subject: KVM: VMX: introduce __vmx_flush_tlb to handle specific vpid Introduce __vmx_flush_tlb() to handle a specific vpid.
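With the vpid made an explicit parameter, callers can name exactly which tag to flush. A sketch of the intended call sites (the nested.vpid02 user only lands in a later patch of this series):

/* flush mappings tagged with this vCPU's own vpid */
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);

/* flush only the nested (vpid02) mappings, leaving L1's TLB entries alive */
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);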
Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1452271e98ae..e1d94f81a28d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1392,13 +1392,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) __loaded_vmcs_clear, loaded_vmcs, 1); } -static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) +static inline void vpid_sync_vcpu_single(int vpid) { - if (vmx->vpid == 0) + if (vpid == 0) return; if (cpu_has_vmx_invvpid_single()) - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0); } static inline void vpid_sync_vcpu_global(void) @@ -1407,10 +1407,10 @@ static inline void vpid_sync_vcpu_global(void) __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); } -static inline void vpid_sync_context(struct vcpu_vmx *vmx) +static inline void vpid_sync_context(int vpid) { if (cpu_has_vmx_invvpid_single()) - vpid_sync_vcpu_single(vmx); + vpid_sync_vcpu_single(vpid); else vpid_sync_vcpu_global(); } @@ -3563,9 +3563,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(to_vmx(vcpu)); + vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -3573,6 +3573,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu) } } +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -4915,7 +4920,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); - vpid_sync_context(vmx); + vpid_sync_context(vmx->vpid); } /* -- cgit v1.2.3-59-g8ed1b From 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:12:21 -0700 Subject: KVM: nVMX: emulate the INVVPID instruction Add the INVVPID instruction emulation. 
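The emulation below decodes the INVVPID type from VMX_INSTRUCTION_INFO and validates it against the capabilities advertised to the guest. For reference, a sketch of the type encoding from the SDM (only single/all-context have named macros in asm/vmx.h at this point in the series):

/*
 * INVVPID extent types (SDM Vol. 3, INVVPID):
 *   0 - individual-address invalidation
 *   1 - single-context invalidation
 *   2 - all-context invalidation
 *   3 - single-context invalidation, retaining globals
 *
 * types = (nested_vmx_vpid_caps >> 8) & 0x7 picks up the three
 * support bits (40..42 of IA32_VMX_EPT_VPID_CAP) for types 0..2.
 */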
Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 61 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d25f32ac9c5c..aa336ff3e03e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -416,6 +416,7 @@ enum vmcs_field { #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) +#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e1d94f81a28d..9b6ab311d0bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -442,6 +442,7 @@ struct nested_vmx { u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; + u32 nested_vmx_vpid_caps; }; #define POSTED_INTR_ON 0 @@ -2612,6 +2613,8 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -7329,7 +7332,63 @@ static int handle_invept(struct kvm_vcpu *vcpu) static int handle_invvpid(struct kvm_vcpu *vcpu) { - kvm_queue_exception(vcpu, UD_VECTOR); + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 vmx_instruction_info; + unsigned long type, types; + gva_t gva; + struct x86_exception e; + int vpid; + + if (!(vmx->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_ENABLE_VPID) || + !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + if (!nested_vmx_check_permission(vcpu)) + return 1; + + vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); + + types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + + if (!(types & (1UL << type))) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + + /* according to the intel vmx instruction reference, the memory + * operand is read even if it isn't needed (e.g., for type==global) + */ + if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + vmx_instruction_info, false, &gva)) + return 1; + if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid, + sizeof(u32), &e)) { + kvm_inject_page_fault(vcpu, &e); + return 1; + } + + switch (type) { + case VMX_VPID_EXTENT_ALL_CONTEXT: + if (get_vmcs12(vcpu)->virtual_processor_id == 0) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return 1; + } + vmx_flush_tlb(vcpu); + nested_vmx_succeed(vcpu); + break; + default: + /* Trap single context invalidation invvpid calls */ + BUG_ON(1); + break; + } + + skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.3-59-g8ed1b From 5c614b3583e7b6dab0c86356fa36c2bcbb8322a0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:36 -0700 Subject: KVM: nVMX: nested VPID emulation VPID is used to tag address space and avoid a TLB flush. Currently L0 use the same VPID to run L1 and all its guests. KVM flushes VPID when switching between L1 and L2. 
This patch advertises VPID to the L1 hypervisor, so that the address spaces of L1 and L2 can be treated separately and a TLB flush can be avoided when switching between L1 and L2. For each nested vmentry, if vpid12 is changed, reuse the shadow vpid w/ an invvpid. Performance: run lmbench on L2 w/ 3.5 kernel.

Context switching - times in microseconds - smaller is better
-------------------------------------------------------------------------
Host      OS            2p/0K  2p/16K 2p/64K 8p/16K 8p/64K 16p/16K 16p/64K
                        ctxsw  ctxsw  ctxsw  ctxsw  ctxsw  ctxsw   ctxsw
--------- ------------- ------ ------ ------ ------ ------ ------- -------
kernel    Linux 3.5.0-1 1.2200 1.3700 1.4500 4.7800 2.3300 5.60000 2.88000  nested VPID
kernel    Linux 3.5.0-1 1.2600 1.4300 1.5600 12.7   12.9   3.49000 7.46000  vanilla

Reviewed-by: Jan Kiszka Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 39 ++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b6ab311d0bd..bea552204aa2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -426,6 +426,9 @@ struct nested_vmx { /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ u64 vmcs01_debugctl; + u16 vpid02; + u16 last_vpid; + u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; u32 nested_vmx_true_procbased_ctls_low; @@ -1213,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12) return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE); } +static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12) +{ + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID); +} + static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12) { return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT); @@ -2590,6 +2598,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_WBINVD_EXITING | @@ -6818,6 +6827,7 @@ static void free_nested(struct vcpu_vmx *vmx) return; vmx->nested.vmxon = false; + free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); @@ -7379,7 +7389,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); return 1; } - vmx_flush_tlb(vcpu); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: @@ -8759,8 +8769,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vmcs; } - if (nested) + if (nested) { nested_vmx_setup_ctls_msrs(vmx); + vmx->nested.vpid02 = allocate_vpid(); + } vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -8781,6 +8793,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return &vmx->vcpu; free_vmcs: + free_vpid(vmx->nested.vpid02); free_loaded_vmcs(vmx->loaded_vmcs); free_msrs: kfree(vmx->guest_msrs); @@ -9665,12 +9678,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (enable_vpid) { /* - * Trivially support vpid by letting L2s share their parent - * L1's vpid. TODO: move to a more elaborate solution, giving - * each L2 its own vpid and exposing the vpid feature to L1.
+ * There is no direct mapping between vpid02 and vpid12; + * vpid02 is per-vCPU for L0 and reused, while the value of + * vpid12 is changed with one invvpid during nested vmentry. + * vpid12 is allocated by L1 for L2, so it will not + * influence the global bitmap (for vpid01 and vpid02 allocation) + * even if L1 spawns a lot of nested vCPUs. */ - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + vmx->nested.last_vpid = vmcs12->virtual_processor_id; + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + } + } else { + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx_flush_tlb(vcpu); + } + } if (nested_cpu_has_ept(vmcs12)) { -- cgit v1.2.3-59-g8ed1b From 089d7b6ec5151ad06a2cd524bc0580d311b641ad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Oct 2015 09:18:37 -0700 Subject: KVM: nVMX: expose VPID capability to L1 Expose VPID capability to L1. For nested guests, we don't do anything specific for single context invalidation. Hence, only advertise support for global context invalidation. The major benefit of nested VPID comes from having separate vpids when switching between L1 and L2, and also when L2's vCPUs are not scheduled in/out on L1. Reviewed-by: Wincy Van Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bea552204aa2..15bff51d492a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2622,7 +2622,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; - vmx->nested.nested_vmx_vpid_caps = 0; + if (enable_vpid) + vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + else + vmx->nested.nested_vmx_vpid_caps = 0; if (enable_unrestricted_guest) vmx->nested.nested_vmx_secondary_ctls_high |= @@ -2739,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) break; case MSR_IA32_VMX_EPT_VPID_CAP: /* Currently, no nested vpid support */ - *pdata = vmx->nested.nested_vmx_ept_caps; + *pdata = vmx->nested.nested_vmx_ept_caps | + ((u64)vmx->nested.nested_vmx_vpid_caps << 32); break; default: return 1; -- cgit v1.2.3-59-g8ed1b From 951f9fd74f2d826fff1d84a8ec34b491517dc15d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 23 Sep 2015 10:34:26 +0200 Subject: KVM: x86: manually unroll bad_mt_xwr loop The loop computes one of two constants, so it is simpler to write everything inline.
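The claimed equivalence between the loop (removed in the diff below) and the inlined constants is easy to check with a throwaway userspace program; a sketch, where REPEAT_BYTE mirrors the kernel's definition in include/linux/kernel.h:

#include <assert.h>
#include <stdint.h>

#define REPEAT_BYTE(x)	((~0ull / 0xff) * (x))

static uint64_t bad_mt_xwr_loop(int execonly)
{
	uint64_t mask = 0;
	int pte;

	/* index: bits 0..2 are XWR permissions, bits 3..5 the memory type */
	for (pte = 0; pte < 64; pte++) {
		int rwx_bits = pte & 7;
		int mt = pte >> 3;

		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
		    rwx_bits == 0x2 || rwx_bits == 0x6 ||
		    (rwx_bits == 0x4 && !execonly))
			mask |= 1ull << pte;
	}
	return mask;
}

static uint64_t bad_mt_xwr_unrolled(int execonly)
{
	uint64_t mask;

	mask  = 0xFFull << (2 * 8);		/* all 8 indices with MT 2 */
	mask |= 0xFFull << (3 * 8);		/* all 8 indices with MT 3 */
	mask |= 0xFFull << (7 * 8);		/* all 8 indices with MT 7 */
	mask |= REPEAT_BYTE(1ull << 2);		/* XWR 010 in every MT byte */
	mask |= REPEAT_BYTE(1ull << 6);		/* XWR 110 in every MT byte */
	if (!execonly)
		mask |= REPEAT_BYTE(1ull << 4);	/* XWR 100 */
	return mask;
}

int main(void)
{
	assert(bad_mt_xwr_loop(0) == bad_mt_xwr_unrolled(0));
	assert(bad_mt_xwr_loop(1) == bad_mt_xwr_unrolled(1));
	return 0;
}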
Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3f39aa9b9cb..b8482c0b75b2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3706,7 +3706,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, int maxphyaddr, bool execonly) { - int pte; + u64 bad_mt_xwr; rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); @@ -3724,14 +3724,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - rsvd_check->bad_mt_xwr |= (1ull << pte); + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; } static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, -- cgit v1.2.3-59-g8ed1b From 6092d3d3e6db983048469d424a8f2221915a8dd3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Oct 2015 15:10:54 +0200 Subject: kvm: svm: Only propagate next_rip when guest supports it Currently we always write the next_rip of the shadow vmcb to the guests vmcb when we emulate a vmexit. This could confuse the guest when its cpuid indicated no support for the next_rip feature. Fix this by only propagating next_rip if the guest actually supports it. Cc: Bandan Das Cc: Dirk Mueller Tested-By: Dirk Mueller Signed-off-by: Joerg Roedel Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.h | 21 +++++++++++++++++++++ arch/x86/kvm/svm.c | 11 ++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d434ee952b00..06332cb7e7d1 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -149,4 +149,25 @@ static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_RDTSCP)); } + +/* + * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 + */ +#define BIT_NRIPS 3 + +static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0); + + /* + * NRIPS is a scattered cpuid feature, so we can't use + * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit + * position 8, not 3). 
+ */ + return best && (best->edx & bit(BIT_NRIPS)); +} +#undef BIT_NRIPS + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54a86183e5d3..cd8659cfc632 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -159,6 +159,9 @@ struct vcpu_svm { u32 apf_reason; u64 tsc_ratio; + + /* cached guest cpuid flags for faster access */ + bool nrips_enabled : 1; }; static DEFINE_PER_CPU(u64, current_tsc_ratio); @@ -2365,7 +2368,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; - nested_vmcb->control.next_rip = vmcb->control.next_rip; + + if (svm->nrips_enabled) + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -4085,6 +4090,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) static void svm_cpuid_update(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + + /* Update nrips enabled cache */ + svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu); } static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -- cgit v1.2.3-59-g8ed1b From cd1872f028556dc0e8424e58413c0268c159383b Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:04:13 +0900 Subject: KVM: x86: MMU: Make force_pt_level bool This will be passed to a function later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b8482c0b75b2..2262728863de 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2962,7 +2962,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; @@ -3476,7 +3476,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3497,9 +3497,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (mapping_level_dirty_bitmap(vcpu, gfn) || !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = 1; + force_pt_level = true; else - force_pt_level = 0; + force_pt_level = false; if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 736e6ab8784d..07f1a4ede637 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - int force_pt_level; + bool force_pt_level; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; @@ -747,7 +747,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) || is_self_change_mapping; else - force_pt_level = 1; + force_pt_level = true; if (!force_pt_level) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = 
walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); -- cgit v1.2.3-59-g8ed1b From 5ed5c5c8fdbab889837c9223fc6f4bdaa830879c Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:05:13 +0900 Subject: KVM: x86: MMU: Simplify force_pt_level calculation code in FNAME(page_fault)() As a bonus, an extra memory slot search can be eliminated when is_self_change_mapping is true. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 07f1a4ede637..8ebc3a5560ce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); - if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) - || is_self_change_mapping; - else + if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + if (!force_pt_level) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); + } + } else force_pt_level = true; - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); - walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); - } mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3-59-g8ed1b From fd136902187838bcae3a572f41cb703553dd63b8 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:06:02 +0900 Subject: KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level() This is necessary to eliminate an extra memory slot search later. Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 29 ++++++++++++++--------------- arch/x86/kvm/paging_tmpl.h | 6 +++--- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2262728863de..890cd694c9a2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -870,10 +870,16 @@ static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + if (likely(!*force_pt_level)) + *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + host_level = host_mapping_level(vcpu->kvm, large_gfn); if (host_level == PT_PAGE_TABLE_LEVEL) @@ -2962,14 +2968,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. 
Therefore check if the level @@ -2979,8 +2984,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3495,20 +3499,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - if (mapping_level_dirty_bitmap(vcpu, gfn) || - !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) - force_pt_level = true; - else - force_pt_level = false; - + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); if (level > PT_DIRECTORY_LEVEL && !check_hugepage_cache_consistency(vcpu, gfn, level)) level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8ebc3a5560ce..bf39d0f3efa9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -744,9 +744,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) { - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); - if (!force_pt_level) { - level = min(walker.level, mapping_level(vcpu, walker.gfn)); + level = mapping_level(vcpu, walker.gfn, &force_pt_level); + if (likely(!force_pt_level)) { + level = min(walker.level, level); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } } else -- cgit v1.2.3-59-g8ed1b From d8aacf5df86a961923a2c9c547d341d64a9d9f5d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:07:01 +0900 Subject: KVM: x86: MMU: Remove mapping_level_dirty_bitmap() Now that it has only one caller, and its name is not so helpful for readers, remove it. The new memslot_valid_for_gpte() function makes it possible to share the common code between gfn_to_memslot_dirty_bitmap() and mapping_level(). 
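The no_dirty_log condition matters because dirty logging records writes at 4K granularity, so a huge mapping would have no single bit to set. A sketch of the bitmap indexing, mirroring mark_page_dirty_in_slot() in virt/kvm/kvm_main.c, makes this concrete:

static void mark_gfn_dirty(struct kvm_memory_slot *slot, gfn_t gfn)
{
	/* one bit per 4K page; a 2M mapping covers 512 of these bits */
	unsigned long rel_gfn = gfn - slot->base_gfn;

	set_bit_le(rel_gfn, slot->dirty_bitmap);
}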
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 890cd694c9a2..09833b04fc97 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -851,6 +851,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) @@ -858,25 +869,22 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_memory_slot *slot; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); if (likely(!*force_pt_level)) - *force_pt_level = mapping_level_dirty_bitmap(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; -- cgit v1.2.3-59-g8ed1b From 5225fdf8c8bea4418f69875804584c89a27c170e Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Fri, 16 Oct 2015 17:08:03 +0900 Subject: KVM: x86: MMU: Eliminate an extra memory slot search in mapping_level() Calling kvm_vcpu_gfn_to_memslot() twice in mapping_level() should be avoided since getting a slot by binary search may not be negligible, especially for virtual machines with many memory slots. 
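For context on the cost being saved: gfn_to_memslot() resolves a gfn by binary search over the memslot array, plus an LRU fast path omitted here. Simplified from search_memslots() in include/linux/kvm_host.h:

static struct kvm_memory_slot *
search_memslots(struct kvm_memslots *slots, gfn_t gfn)
{
	int start = 0, end = slots->used_slots;
	struct kvm_memory_slot *memslots = slots->memslots;

	/* slots are kept sorted by base_gfn, highest first */
	while (start < end) {
		int slot = start + (end - start) / 2;

		if (gfn >= memslots[slot].base_gfn)
			end = slot;
		else
			start = slot + 1;
	}

	if (gfn >= memslots[start].base_gfn &&
	    gfn < memslots[start].base_gfn + memslots[start].npages)
		return &memslots[start];

	return NULL;
}

Hoisting the lookup out of mapping_level() means the fault path does this walk once instead of once per candidate page-table level.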
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09833b04fc97..dd2a7c6ec2ca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm_vcpu *vcpu, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -896,7 +901,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; -- cgit v1.2.3-59-g8ed1b From 7cae2bedcbd4680b155999655e49c27b9cf020fa Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 Oct 2015 19:33:09 -0300 Subject: KVM: x86: move steal time initialization to vcpu entry time As reported at https://bugs.launchpad.net/qemu/+bug/1494350, it is possible to have vcpu->arch.st.last_steal initialized from a thread other than vcpu thread, say the iothread, via KVM_SET_MSRS. Which can cause an overflow later (when subtracting from vcpu threads sched_info.run_delay). To avoid that, move steal time accumulation to vcpu entry time, before copying steal time data to guest. Signed-off-by: Marcelo Tosatti Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e33aebbf189e..9e9c226cb79d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1903,6 +1903,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { + accumulate_steal_time(vcpu); + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -2053,12 +2055,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & KVM_MSR_ENABLED)) break; - vcpu->arch.st.last_steal = current->sched_info.run_delay; - - preempt_disable(); - accumulate_steal_time(vcpu); - preempt_enable(); - kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; @@ -2634,7 +2630,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->cpu = cpu; } - accumulate_steal_time(vcpu); kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } -- cgit v1.2.3-59-g8ed1b From 5690891bcec5fcfda38da974ffa5488e36a59811 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 19 Oct 2015 11:30:19 +0200 Subject: kvm: x86: zero EFER on INIT Not zeroing EFER means that a 32-bit firmware cannot enter paging mode without clearing EFER.LME first (which it should not know about). 
Yang Zhang from Intel confirmed that the manual is wrong and EFER is cleared to zero on INIT. Fixes: d28bc9dd25ce023270d2e039e7c98d38ecbf7758 Cc: stable@vger.kernel.org Cc: Yang Z Zhang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 +++++------ arch/x86/kvm/vmx.c | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cd8659cfc632..f2c8e4917688 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1089,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm, bool init_event) +static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1160,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - if (!init_event) - svm_set_efer(&svm->vcpu, 0); + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; @@ -1215,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; } - init_vmcb(svm, init_event); + init_vmcb(svm); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1271,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm, false); + init_vmcb(svm); svm_init_osvw(&svm->vcpu); @@ -1893,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm, false); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 15bff51d492a..4d0aa31a42ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4932,8 +4932,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx_set_cr0(vcpu, cr0); /* enter rmode */ vmx->vcpu.arch.cr0 = cr0; vmx_set_cr4(vcpu, 0); - if (!init_event) - vmx_set_efer(vcpu, 0); + vmx_set_efer(vcpu, 0); vmx_fpu_activate(vcpu); update_exception_bitmap(vcpu); -- cgit v1.2.3-59-g8ed1b From 8c85ac1c0a1b41299370857765bc0950666ed5d9 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 19 Oct 2015 15:13:29 +0900 Subject: KVM: x86: MMU: Initialize force_pt_level before calling mapping_level() Commit fd1369021878 ("KVM: x86: MMU: Move mapping_level_dirty_bitmap() call in mapping_level()") forgot to initialize force_pt_level to false in FNAME(page_fault)() before calling mapping_level() like nonpaging_map() does. This can sometimes result in forcing page table level mapping unnecessarily. Fix this and move the first *force_pt_level check in mapping_level() before kvm_vcpu_gfn_to_memslot() call to make it a bit clearer that the variable must be initialized before mapping_level() gets called. This change can also avoid calling kvm_vcpu_gfn_to_memslot() when !check_hugepage_cache_consistency() check in tdp_page_fault() forces page table level mapping. 
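After this fix the convention is uniform: force_pt_level is an in/out flag that every caller must seed before the call. A fragment mirroring the fixed callers (nonpaging_map(), tdp_page_fault() and FNAME(page_fault)):

bool force_pt_level = false;	/* seed; mapping_level() may refine it */
int level = mapping_level(vcpu, gfn, &force_pt_level);

if (likely(!force_pt_level))
	/* align gfn to the chosen large-page boundary */
	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);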
Signed-off-by: Takuya Yoshikawa Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++--- arch/x86/kvm/paging_tmpl.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index dd2a7c6ec2ca..7d85bcae3332 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -886,10 +886,11 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, int host_level, level, max_level; struct kvm_memory_slot *slot; - slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; - if (likely(!*force_pt_level)) - *force_pt_level = !memslot_valid_for_gpte(slot, true); + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); if (unlikely(*force_pt_level)) return PT_PAGE_TABLE_LEVEL; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf39d0f3efa9..b41faa91a6f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level; + bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; -- cgit v1.2.3-59-g8ed1b From 3217f7c25bca66eed9b07f0b8bfd1937169b0736 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Thu, 27 Aug 2015 16:41:15 +0200 Subject: KVM: Add kvm_arch_vcpu_{un}blocking callbacks Sometimes it is useful for architecture implementations of KVM to know when the VCPU thread is about to block or when it comes back from blocking (arm/arm64 needs to know this to properly implement timers, for example). Therefore provide a generic architecture callback function in line with what we do elsewhere for KVM generic-arch interactions.
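For an architecture that needs the hooks, the pair might be filled in roughly as follows (a hypothetical sketch; the timer handling is only indicative of what arm/arm64 would do):

void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{
	/* e.g. hand the guest timer over to a software hrtimer whose
	 * expiry can kick the VCPU out of its blocked state */
}

void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
	/* undo the above once the VCPU is about to run again */
}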
Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_host.h | 3 +++ arch/arm64/include/asm/kvm_host.h | 3 +++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 3 +++ 8 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index c4072d9f32c7..84da97901f1f 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -234,4 +234,7 @@ static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ed039688c221..e4f4d65f7d2b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -255,4 +255,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 5a1a882e0a75..6ded8d347af9 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 827a38d7a9db..c9f122d00920 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -718,5 +718,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 8ced426091e1..72a614c68ed8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 
2beee0382088..b28f0f142ecb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1233,4 +1233,7 @@ int x86_set_memory_region(struct kvm *kvm, bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1bef9e21e725..4a86f5f072c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -625,6 +625,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); +void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8db1d9361993..7873d6daccb1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2018,6 +2018,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } while (single_task_running() && ktime_before(cur, stop)); } + kvm_arch_vcpu_blocking(vcpu); + for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); @@ -2031,6 +2033,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) finish_wait(&vcpu->wq, &wait); cur = ktime_get(); + kvm_arch_vcpu_unblocking(vcpu); out: block_ns = ktime_to_ns(cur) - ktime_to_ns(start); -- cgit v1.2.3-59-g8ed1b From 2da29bccc5045ea10c70cb3a69be777768fd0b66 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Fri, 30 Oct 2015 12:56:11 +0530 Subject: KVM: x86: removing unused variable removing unused variables, found by coccinelle Signed-off-by: Saurabh Sengar Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9e9c226cb79d..57b5f79aad08 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3438,41 +3438,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0; - mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); memset(&ps->reserved, 0, sizeof(ps->reserved)); - return r; + return 0; } static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) { - int r = 0, start = 0; + int start = 0; u32 prev_legacy, cur_legacy; mutex_lock(&kvm->arch.vpit->pit_state.lock); prev_legacy = kvm->arch.vpit->pit_state.flags & 
KVM_PIT_FLAGS_HPET_LEGACY; @@ -3484,7 +3478,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) kvm->arch.vpit->pit_state.flags = ps->flags; kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); - return r; + return 0; } static int kvm_vm_ioctl_reinject(struct kvm *kvm, -- cgit v1.2.3-59-g8ed1b From 7a036a6f670f63b32c5ee126425f9109271ca13f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:24 +0100 Subject: KVM: x86: add read_phys to x86_emulate_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to read the physical memory when emulating RSM. X86EMUL_IO_NEEDED is returned on all errors for consistency with other helpers. Signed-off-by: Radim Krčmář Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 10 ++++++++++ arch/x86/kvm/x86.c | 10 ++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index e16466ec473c..e9cd7befcb76 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -111,6 +111,16 @@ struct x86_emulate_ops { unsigned int bytes, struct x86_exception *fault); + /* + * read_phys: Read bytes of standard (non-emulated/special) memory. + * Used for descriptor reading. + * @addr: [IN ] Physical address from which to read. + * @val: [OUT] Value read from memory. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes); + /* * write_std: Write bytes of standard (non-emulated/special) memory. * Used for descriptor writing. diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 57b5f79aad08..a24bae0a83a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4089,6 +4089,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } +static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes); + + return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE; +} + int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, @@ -4824,6 +4833,7 @@ static const struct x86_emulate_ops emulate_ops = { .write_gpr = emulator_write_gpr, .read_std = kvm_read_guest_virt_system, .write_std = kvm_write_guest_virt_system, + .read_phys = kvm_read_guest_phys_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, -- cgit v1.2.3-59-g8ed1b From f40606b147dd5b4678cedc877a71deb520ca507e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 16:36:25 +0100 Subject: KVM: x86: handle SMBASE as physical address in RSM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET_SMSTATE depends on real mode to ensure that smbase+offset is treated as a physical address, which has already caused a bug after shuffling the code. Enforce physical addressing. 
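With the new callback in place, a field of the SMM state-save area is fetched along these lines (a sketch; 0x7ffc is the CR0 slot of the 32-bit save area, shown here only as an example offset):

/* GET_SMSTATE now expands to a ctxt->ops->read_phys() call, so
 * smbase + offset is used as a physical address regardless of the
 * CPU mode at the time of RSM */
u32 cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);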
Signed-off-by: Radim Krčmář Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9da95b9daf8d..b60fed56671b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) #define GET_SMSTATE(type, smbase, offset) \ ({ \ type __val; \ - int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ - sizeof(__val), NULL); \ + int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ + sizeof(__val)); \ if (r != X86EMUL_CONTINUE) \ return X86EMUL_UNHANDLEABLE; \ __val; \ @@ -2484,8 +2484,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed - * to read_std/write_std are not virtual. + * CR0/CR3/CR4/EFER. * * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. */ -- cgit v1.2.3-59-g8ed1b From c75efa974e013640496620f26f0b532cb5cb17f9 Mon Sep 17 00:00:00 2001 From: Andrey Smetanin Date: Fri, 16 Oct 2015 10:07:50 +0300 Subject: drivers/hv: share Hyper-V SynIC constants with userspace Moved the Hyper-V SynIC constants from the guest Hyper-V drivers' private header into the x86 arch uapi Hyper-V header. Added the Hyper-V SynIC MSR flags to the x86 arch uapi Hyper-V header. Signed-off-by: Andrey Smetanin Reviewed-by: Roman Kagan Signed-off-by: Denis V. Lunev CC: Vitaly Kuznetsov CC: "K. Y. Srinivasan" CC: Gleb Natapov CC: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/hyperv.h | 12 ++++++++++++ drivers/hv/hyperv_vmbus.h | 5 ----- include/linux/hyperv.h | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 2677a0aac2cc..040d4083c24f 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -257,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE { __s64 tsc_offset; } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + #endif diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 3d70e36c918e..3782636562a1 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -63,9 +63,6 @@ enum hv_cpuid_function { /* Define version of the synthetic interrupt controller. */ #define HV_SYNIC_VERSION (1) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) - /* Define synthetic interrupt controller message constants. */ #define HV_MESSAGE_SIZE (256) #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) @@ -105,8 +102,6 @@ enum hv_message_type { HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) #define HV_SYNIC_STIMER_COUNT (4) /* Define invalid partition identifier.
*/ diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 54733d5b503e..8fdc17b84739 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -26,6 +26,7 @@ #define _HYPERV_H #include +#include #include #include -- cgit v1.2.3-59-g8ed1b From 0669a51015c58b1f036030743a0c0781eb63867f Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Fri, 30 Oct 2015 15:48:20 +0100 Subject: KVM: x86: zero apic_arb_prio on reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BSP doesn't get INIT so its apic_arb_prio isn't zeroed after reboot. BSP won't get lowest priority interrupts until other VCPUs get enough interrupts to match their pre-reboot apic_arb_prio. That behavior doesn't fit into KVM's round-robin-like interpretation of lowest priority delivery ... userspace should KVM_SET_LAPIC on reset, so just zero apic_arb_prio there. Reported-by: Yuki Shibuya Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 168b8759bd73..ecd4ea1d28a8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1918,6 +1918,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_rtc_eoi_tracking_restore_one(vcpu); + + vcpu->arch.apic_arb_prio = 0; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) -- cgit v1.2.3-59-g8ed1b From b97e6de9c96cefaa02a6a7464731ea504b45e150 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:16:47 +0100 Subject: KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We do not want to do too much work in atomic context, in particular not walking all the VCPUs of the virtual machine. So we want to distinguish the architecture-specific injection function for irqfd from kvm_set_msi. Since it's still empty, reuse the newly added kvm_arch_set_irq and rename it to kvm_arch_set_irq_inatomic. 
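The weak default that the rename builds on is tiny; in outline (a sketch of the mechanism, matching the eventfd.c hunk below):

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
			struct kvm_kernel_irq_routing_entry *irq,
			struct kvm *kvm, int irq_source_id,
			int level, bool line_status)
{
	/* architectures that cannot inject from atomic context land
	 * here; the caller then falls back to a work item */
	return -EWOULDBLOCK;
}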
Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/irq_comm.c | 14 ++++++++------ include/linux/kvm_host.h | 7 +++---- virt/kvm/eventfd.c | 11 ++++------- 3 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 8f4499c7ffc1..75dc633c48dc 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -124,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, } -static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, - struct kvm *kvm) +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) { struct kvm_lapic_irq irq; int r; + if (unlikely(e->type != KVM_IRQ_ROUTING_MSI)) + return -EWOULDBLOCK; + kvm_set_msi_irq(e, &irq); if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL)) @@ -165,10 +169,8 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) idx = srcu_read_lock(&kvm->irq_srcu); if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { e = &entries[0]; - if (likely(e->type == KVM_IRQ_ROUTING_MSI)) - ret = kvm_set_msi_inatomic(e, kvm); - else - ret = -EWOULDBLOCK; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); } srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 87189a41d904..15c78f320678 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -830,10 +830,9 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); - -int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, - int irq_source_id, int level, bool line_status); - +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status); bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin); void kvm_notify_acked_gsi(struct kvm *kvm, int gsi); void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e29fd2640709..46dbc0a7dfc1 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -171,7 +171,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) queue_work(irqfd_cleanup_wq, &irqfd->shutdown); } -int __attribute__((weak)) kvm_arch_set_irq( +int __attribute__((weak)) kvm_arch_set_irq_inatomic( struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm, int irq_source_id, int level, @@ -201,12 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) irq = irqfd->irq_entry; } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq)); /* An event has been signaled, inject an interrupt */ - if (irq.type == KVM_IRQ_ROUTING_MSI) - kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false); - else if (kvm_arch_set_irq(&irq, kvm, - KVM_USERSPACE_IRQ_SOURCE_ID, 1, - false) == -EWOULDBLOCK) + if (kvm_arch_set_irq_inatomic(&irq, kvm, + KVM_USERSPACE_IRQ_SOURCE_ID, 1, + false) == -EWOULDBLOCK) schedule_work(&irqfd->inject); srcu_read_unlock(&kvm->irq_srcu, idx); } -- cgit v1.2.3-59-g8ed1b From 7695405698d011c05a95288f1f3c0a3dd252dccc Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 19:22:20 +0100 Subject: KVM: device 
assignment: remove pointless #ifdefs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbols are always defined. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index d090ecf08809..1c17ee807ef7 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -131,7 +131,6 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef __KVM_HAVE_MSI static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -150,9 +149,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) return IRQ_HANDLED; } -#endif -#ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; @@ -183,7 +180,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) return IRQ_HANDLED; } -#endif /* Ack the irq line for an assigned device */ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) @@ -386,7 +382,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_host_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -408,9 +403,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_host_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { @@ -443,8 +436,6 @@ err: return r; } -#endif - static int assigned_device_enable_guest_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -454,7 +445,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm, return 0; } -#ifdef __KVM_HAVE_MSI static int assigned_device_enable_guest_msi(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -463,9 +453,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif -#ifdef __KVM_HAVE_MSIX static int assigned_device_enable_guest_msix(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, struct kvm_assigned_irq *irq) @@ -474,7 +462,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, dev->ack_notifier.gsi = -1; return 0; } -#endif static int assign_host_irq(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev, @@ -492,16 +479,12 @@ static int assign_host_irq(struct kvm *kvm, case KVM_DEV_IRQ_HOST_INTX: r = assigned_device_enable_host_intx(kvm, dev); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_HOST_MSI: r = assigned_device_enable_host_msi(kvm, dev); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_HOST_MSIX: r = assigned_device_enable_host_msix(kvm, dev); break; -#endif default: r = -EINVAL; } @@ -534,16 +517,12 @@ static int assign_guest_irq(struct kvm *kvm, case KVM_DEV_IRQ_GUEST_INTX: r = assigned_device_enable_guest_intx(kvm, dev, irq); break; -#ifdef __KVM_HAVE_MSI case KVM_DEV_IRQ_GUEST_MSI: r = assigned_device_enable_guest_msi(kvm, dev, irq); break; -#endif -#ifdef __KVM_HAVE_MSIX case KVM_DEV_IRQ_GUEST_MSIX: r = assigned_device_enable_guest_msix(kvm, dev, irq); break; -#endif default: r = -EINVAL; } @@ -826,7 +805,6 @@ out: } -#ifdef __KVM_HAVE_MSIX static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 
struct kvm_assigned_msix_nr *entry_nr) { @@ -906,7 +884,6 @@ msix_entry_out: return r; } -#endif static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) @@ -1012,7 +989,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; r = -EFAULT; @@ -1033,7 +1009,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, goto out; break; } -#endif case KVM_ASSIGN_SET_INTX_MASK: { struct kvm_assigned_pci_dev assigned_dev; -- cgit v1.2.3-59-g8ed1b From 8a22f234a81ab4d1de5d948c3478608f08a9b844 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 28 Oct 2015 18:52:02 +0100 Subject: KVM: x86: move kvm_set_irq_inatomic to legacy device assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function is not used outside device assignment, and kvm_arch_set_irq_inatomic has a different prototype. Move it here and make it static to avoid confusion. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/assigned-dev.c | 37 +++++++++++++++++++++++++++++++++++++ arch/x86/kvm/irq_comm.c | 34 ---------------------------------- include/linux/kvm_host.h | 1 - 3 files changed, 37 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 1c17ee807ef7..9dc091acd5fb 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -21,6 +21,7 @@ #include #include "irq.h" #include "assigned-dev.h" +#include "trace/events/kvm.h" struct kvm_assigned_dev_kernel { struct kvm_irq_ack_notifier ack_notifier; @@ -131,6 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) return IRQ_HANDLED; } +/* + * Deliver an IRQ in an atomic context if we can, or return a failure, + * user can retry in a process context. + * Return value: + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. + * Other values - No need to retry. + */ +static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, + int level) +{ + struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; + struct kvm_kernel_irq_routing_entry *e; + int ret = -EINVAL; + int idx; + + trace_kvm_set_irq(irq, level, irq_source_id); + + /* + * Injection into either PIC or IOAPIC might need to scan all CPUs, + * which would need to be retried from thread context; when same GSI + * is connected to both PIC and IOAPIC, we'd have to report a + * partial failure here. + * Since there's no easy way to do this, we only support injecting MSI + * which is limited to 1:1 GSI mapping. + */ + idx = srcu_read_lock(&kvm->irq_srcu); + if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { + e = &entries[0]; + ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, + irq, level); + } + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 75dc633c48dc..84b96d319909 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -142,40 +142,6 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, return -EWOULDBLOCK; } -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. 
- * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - int kvm_request_irq_source_id(struct kvm *kvm) { unsigned long *bitmap = &kvm->arch.irq_sources_bitmap; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 15c78f320678..242a6d2b53ff 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -827,7 +827,6 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin); int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); -int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, int irq_source_id, int level, bool line_status); int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, -- cgit v1.2.3-59-g8ed1b From 656ec4a4928a3db7d16e5cb9bce351a478cfd3d5 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Mon, 2 Nov 2015 22:20:00 +0100 Subject: KVM: VMX: fix SMEP and SMAP without EPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment in code had it mostly right, but we enable paging for emulated real mode regardless of EPT. Without EPT (which implies emulated real mode), secondary VCPUs won't start unless we disable SM[AE]P when the guest doesn't use paging. Signed-off-by: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d0aa31a42ba..2ac116417583 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3788,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; - /* - * SMEP/SMAP is disabled if CPU is in non-paging mode - * in hardware. However KVM always uses paging mode to - * emulate guest non-paging mode with TDP. - * To emulate this behavior, SMEP/SMAP needs to be - * manually disabled when guest switches to non-paging - * mode. - */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } } + if (!enable_unrestricted_guest && !is_paging(vcpu)) + /* + * SMEP/SMAP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode without + * unrestricted guest. + * To emulate this behavior, SMEP/SMAP needs to be manually + * disabled when guest switches to non-paging mode. 
+ */ + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); return 0; -- cgit v1.2.3-59-g8ed1b From 89651a3decbe03754f304a0b248f27eeb9a37937 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 3 Nov 2015 13:43:05 +0100 Subject: KVM: x86: allow RSM from 64-bit mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDM says that exiting system management mode from 64-bit mode is invalid, but that would be too good to be true. But actually, most of the code is already there to support exiting from compat mode (EFER.LME=1, EFER.LMA=0). Getting all the way from 64-bit mode to real mode only requires clearing CS.L and CR4.PCIDE. Cc: stable@vger.kernel.org Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Tested-by: Laszlo Ersek Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b60fed56671b..1505587d06e9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2484,16 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) /* * Get back to real mode, to prepare a safe state in which to load - * CR0/CR3/CR4/EFER. - * - * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU + * supports long mode. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (emulator_has_longmode(ctxt)) { + struct desc_struct cs_desc; + + /* Zero CR4.PCIDE before CR0.PG. */ + if (cr4 & X86_CR4_PCIDE) { + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + cr4 &= ~X86_CR4_PCIDE; + } + + /* A 32-bit code segment is required to clear EFER.LMA. */ + memset(&cs_desc, 0, sizeof(cs_desc)); + cs_desc.type = 0xb; + cs_desc.s = cs_desc.g = cs_desc.p = 1; + ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS); + } + + /* For the 64-bit case, this will clear EFER.LMA. */ cr0 = ctxt->ops->get_cr(ctxt, 0); if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - cr4 = ctxt->ops->get_cr(ctxt, 4); + + /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ if (cr4 & X86_CR4_PAE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + + /* And finally go back to 32-bit mode. */ efer = 0; ctxt->ops->set_msr(ctxt, MSR_EFER, efer); @@ -4454,7 +4474,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), + II(EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), -- cgit v1.2.3-59-g8ed1b From 879ae1880449c88db11c1ebdaedc2da79b2fe73f Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 4 Nov 2015 12:54:41 +0100 Subject: KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0() Commit b18d5431acc7 ("KVM: x86: fix CR0.CD virtualization") was technically correct, but it broke OVMF guests by slowing down various parts of the firmware. Commit fb279950ba02 ("KVM: vmx: obey KVM_QUIRK_CD_NW_CLEARED") quirked the first function modified by b18d5431acc7, vmx_get_mt_mask(), for OVMF's sake. 
This restored the speed of the OVMF code that runs before PlatformPei (including the memory intensive LZMA decompression in SEC). This patch extends the quirk to the second function modified by b18d5431acc7, kvm_set_cr0(). It eliminates the intrusive slowdown that hits the EFI_MP_SERVICES_PROTOCOL implementation of edk2's UefiCpuPkg/CpuDxe -- which is built into OVMF -- when CpuDxe starts up all APs at once for initialization, in order to count them. We also carry over the kvm_arch_has_noncoherent_dma() sub-condition from the other half of the original commit b18d5431acc7. Fixes: b18d5431acc7a2fd22767925f3a6f597aa4bd29e Cc: stable@vger.kernel.org Cc: Jordan Justen Cc: Alex Williamson Reviewed-by: Xiao Guangrong Tested-by: Janusz Mocek Signed-off-by: Laszlo Ersek Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a24bae0a83a2..30723a4122cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -625,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); - if ((cr0 ^ old_cr0) & X86_CR0_CD) + if (((cr0 ^ old_cr0) & X86_CR0_CD) && + kvm_arch_has_noncoherent_dma(vcpu->kvm) && + !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); return 0; -- cgit v1.2.3-59-g8ed1b From a3eaa8649e4c6a6afdafaa04b9114fb230617bb1 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Wed, 4 Nov 2015 13:46:05 +0800 Subject: KVM: VMX: Fix commit which broke PML I found that PML has been broken since the commit below: commit feda805fe7c4ed9cf78158e73b1218752e3b4314 Author: Xiao Guangrong Date: Wed Sep 9 14:05:55 2015 +0800 KVM: VMX: unify SECONDARY_VM_EXEC_CONTROL update Unify the update in vmx_cpuid_update() Signed-off-by: Xiao Guangrong [Rewrite to use vmcs_set_secondary_exec_control. - Paolo] Signed-off-by: Paolo Bonzini The reason is that in the above commit, vmx_cpuid_update calls vmx_secondary_exec_control, in which the SECONDARY_EXEC_ENABLE_PML bit is currently cleared unconditionally (as PML is enabled when the vcpu is created). Therefore, if vmx_cpuid_update is called after the vcpu is created, PML will be disabled unexpectedly while the log-dirty code still thinks PML is in use. Fix this by clearing SECONDARY_EXEC_ENABLE_PML in vmx_secondary_exec_control only when PML is not supported or not enabled (!enable_pml). This is more reasonable as PML is currently either always enabled or always disabled. With this change, explicitly updating SECONDARY_EXEC_ENABLE_PML in vmx_enable{disable}_pml is no longer needed, so also rename vmx_enable{disable}_pml to vmx_create{destroy}_pml_buffer. Fixes: feda805fe7c4ed9cf78158e73b1218752e3b4314 Signed-off-by: Kai Huang [While at it, change a wrong ASSERT to an "if". The condition can happen if creating the VCPU fails with ENOMEM.
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2ac116417583..5eb56ed77c1f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4718,8 +4718,9 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) a current VMCS12 */ exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - /* PML is enabled/disabled in creating/destorying vcpu */ - exec_control &= ~SECONDARY_EXEC_ENABLE_PML; + + if (!enable_pml) + exec_control &= ~SECONDARY_EXEC_ENABLE_PML; /* Currently, we allow L1 guest to directly run pcommit instruction. */ exec_control &= ~SECONDARY_EXEC_PCOMMIT; @@ -7804,7 +7805,7 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } -static int vmx_enable_pml(struct vcpu_vmx *vmx) +static int vmx_create_pml_buffer(struct vcpu_vmx *vmx) { struct page *pml_pg; @@ -7817,18 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx) vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg)); vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1); - vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); - return 0; } -static void vmx_disable_pml(struct vcpu_vmx *vmx) +static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) { - ASSERT(vmx->pml_pg); - __free_page(vmx->pml_pg); - vmx->pml_pg = NULL; - - vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_ENABLE_PML); + if (vmx->pml_pg) { + __free_page(vmx->pml_pg); + vmx->pml_pg = NULL; + } } static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) @@ -8706,7 +8704,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (enable_pml) - vmx_disable_pml(vmx); + vmx_destroy_pml_buffer(vmx); free_vpid(vmx->vpid); leave_guest_mode(vcpu); vmx_load_vmcs01(vcpu); @@ -8790,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) * for the guest, etc. */ if (enable_pml) { - err = vmx_enable_pml(vmx); + err = vmx_create_pml_buffer(vmx); if (err) goto free_vmcs; } -- cgit v1.2.3-59-g8ed1b
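The net effect of the PML fix can be condensed to a few lines (a sketch; base_secondary_exec_controls() is a hypothetical stand-in for the real feature computation in vmx_secondary_exec_control()):

u32 exec_control = base_secondary_exec_controls();
/* Recomputed on every CPUID update, so the PML bit must survive
 * the recomputation whenever PML is globally enabled. */
if (!enable_pml)
	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
/* The per-vcpu code is now only responsible for creating and
 * destroying the PML buffer, hence the rename. */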